diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index cab6a16641..dab4957b2f 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -15,10 +15,10 @@ jobs: linters: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -36,8 +36,9 @@ jobs: - name: Check Sphinx Gallery cache run: python docs/src/check_gallery.py - build: - timeout-minutes: 30 + + multibuild: + timeout-minutes: 35 runs-on: ${{ matrix.os }} defaults: run: @@ -48,9 +49,6 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] - os: [ubuntu-latest, macos-latest, windows-latest] - platform: [x64] include: # # We want the _oldest_ possible manylinux version to ensure our @@ -76,40 +74,34 @@ jobs: # - os: ubuntu-latest manylinux-version: 2010 - python-version: 3.7 - build-depends: numpy==1.17.0 - - - os: ubuntu-latest - manylinux-version: 2010 - python-version: 3.8 + python-version: "3.8" build-depends: numpy==1.17.3 - os: ubuntu-latest manylinux-version: 2010 - python-version: 3.9 + python-version: "3.9" build-depends: numpy==1.19.3 - os: ubuntu-latest manylinux-version: 2014 python-version: "3.10" build-depends: numpy==1.22.2 scipy==1.8.0 + + - os: ubuntu-latest + manylinux-version: 2014 + python-version: "3.11" + build-depends: numpy==1.23.2 scipy==1.9.2 - os: macos-latest travis-os-name: osx manylinux-version: 1 - python-version: 3.7 - build-depends: numpy==1.17.0 - - - os: macos-latest - travis-os-name: osx - manylinux-version: 1 - python-version: 3.8 + python-version: "3.8" build-depends: numpy==1.17.3 - os: macos-latest travis-os-name: osx manylinux-version: 1 - python-version: 3.9 + python-version: "3.9" build-depends: numpy==1.19.3 - os: macos-latest @@ -117,45 +109,33 @@ jobs: manylinux-version: 1 python-version: "3.10" build-depends: numpy==1.22.2 scipy==1.8.0 - - - os: windows-latest - manylinux-version: 2010 - python-version: 3.7 - build-depends: numpy==1.17.0 - - - os: windows-latest - manylinux-version: 2010 - python-version: 3.8 - build-depends: numpy==1.17.3 - - - os: windows-latest - manylinux-version: 2010 - python-version: 3.9 - build-depends: numpy==1.19.3 - - - os: windows-latest - manylinux-version: 2010 - python-version: "3.10" - build-depends: numpy==1.22.2 scipy==1.8.0 + + - os: macos-latest + travis-os-name: osx + manylinux-version: 1 + python-version: "3.11" + build-depends: numpy==1.23.2 scipy==1.9.2 env: - PKG_NAME: gensim - REPO_DIR: gensim - BUILD_COMMIT: HEAD - PLAT: x86_64 - UNICODE_WIDTH: 32 - MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild + SKIP_NETWORK_TESTS: 1 TEST_DEPENDS: pytest mock testfixtures + BUILD_DEPENDS: ${{ matrix.build-depends }} + + # + # For multibuild + # + BUILD_COMMIT: HEAD DOCKER_TEST_IMAGE: multibuild/xenial_x86_64 - TRAVIS_OS_NAME: ${{ matrix.travis-os-name }} - SKIP_NETWORK_TESTS: 1 MB_ML_VER: ${{ matrix.manylinux-version }} - WHEELHOUSE_UPLOADER_USERNAME: ${{ secrets.AWS_ACCESS_KEY_ID }} - WHEELHOUSE_UPLOADER_SECRET: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - BUILD_DEPENDS: ${{ matrix.build-depends }} + MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild + PKG_NAME: gensim + PLAT: x86_64 + REPO_DIR: gensim + TRAVIS_OS_NAME: ${{ matrix.travis-os-name }} + UNICODE_WIDTH: 32 steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: recursive fetch-depth: 0 @@ -168,15 +148,14 @@ jobs: echo "TRAVIS_OS_NAME: ${TRAVIS_OS_NAME}" echo "SKIP_NETWORK_TESTS: ${SKIP_NETWORK_TESTS}" - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip pip install virtualenv - - name: Build Wheel (Multibuild) - if: matrix.os != 'windows-latest' + - name: Build Wheel run: | echo ::group::Set up Multibuild source multibuild/common_utils.sh @@ -191,32 +170,125 @@ jobs: build_wheel $REPO_DIR ${{ matrix.PLAT }} echo ::endgroup:: + - name: Prepare for testing + run: | + # + # FIXME: Why are these eggs here? + # + # These eggs prevent the wheel from building and running on Py3.10 + # + find . -type f -name "*.egg" -exec rm -v {} \; + python -m venv test_environment + # - # We can't use multibuild on Windows, so we have to roll our own build script. - # Adapted from - # https://github.com/RaRe-Technologies/gensim-wheels/commit/084b863390edee05bbe15d4ec05d1ab726e52202 + # Multibuild has a test step but it essentially just installs the wheel + # and runs the test, and requires a lot of magic to get it working. + # It also does not work under Windows. + # So, we create our own simple test step here. # - - name: Build Wheel (Windows) - if: matrix.os == 'windows-latest' + - name: Install and Test Wheel + run: | + . test_environment/bin/activate + python -m pip install --upgrade pip + pip install pytest testfixtures mock + pip install wheelhouse/*.whl + cd test_environment + python -c 'import gensim;print(gensim.__version__)' + # + # This part relies on the wheel containing tests and required data. + # If we remove that from the wheel, we'll need to rewrite this step. + # + pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim + + - name: Upload wheels to s3://gensim-wheels + # + # Only do this if the credentials are set. + # This means that PRs will still build wheels, but not upload them. + # (PRs do not have access to secrets). + # + # The always() ensures this step runs even if a previous step fails. + # We want to upload wheels whenever possible (even if e.g. tests failed) + # because we don't want an innocuous test failure from blocking a release. + # + if: ${{ always() && env.WHEELHOUSE_UPLOADER_USERNAME && env.WHEELHOUSE_UPLOADER_SECRET }} + run: | + python -m pip install wheelhouse-uploader + ls wheelhouse/*.whl + python -m wheelhouse_uploader upload --local-folder wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn + env: + WHEELHOUSE_UPLOADER_USERNAME: ${{ secrets.AWS_ACCESS_KEY_ID }} + WHEELHOUSE_UPLOADER_SECRET: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + + # + # The build process for windows is different to that of Linux and MacOS. + # First, we cannot use multibuild (it does not support Windows). + # This means we have to write our own building and testing steps, but in a + # way it's simpler, because we don't need to care about configuring + # multibuild ourselves. + # Second, the syntax to enable virtual environments, etc. is different. 
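The "different syntax" mentioned above boils down to the virtual-environment layout: the interpreter and activation script live under `bin/` on Linux/macOS but under `Scripts\` on Windows. Below is a minimal Python sketch of handling both layouts from one helper; the helper name and standalone-script form are illustrative assumptions, since the workflow itself does this in shell steps.

```python
import glob
import os
import subprocess
import sys


def create_test_environment(venv_dir="test_environment"):
    """Create a venv and return the path to its interpreter (illustrative sketch only)."""
    subprocess.run([sys.executable, "-m", "venv", venv_dir], check=True)
    if os.name == "nt":
        # Windows layout: test_environment\Scripts\python.exe (activated via Scripts\activate.bat)
        return os.path.join(venv_dir, "Scripts", "python.exe")
    # Linux/macOS layout: test_environment/bin/python (activated via bin/activate)
    return os.path.join(venv_dir, "bin", "python")


if __name__ == "__main__":
    python = create_test_environment()
    # Install the freshly built wheel into the isolated environment and smoke-test the import,
    # mirroring what the "Install and Test Wheel" steps do in shell.
    wheel = glob.glob(os.path.join("wheelhouse", "*.whl"))[0]
    subprocess.run([python, "-m", "pip", "install", wheel], check=True)
    subprocess.run([python, "-c", "import gensim; print(gensim.__version__)"], check=True)
```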
+ # + build_windows: + timeout-minutes: 35 + runs-on: windows-latest + defaults: + run: + shell: bash + + needs: [linters] + + strategy: + fail-fast: false + matrix: + include: + - python-version: "3.8" + build-depends: numpy==1.17.3 + + - python-version: "3.9" + build-depends: numpy==1.19.3 + + - python-version: "3.10" + build-depends: numpy==1.22.2 scipy==1.8.0 + + - python-version: "3.11" + build-depends: numpy==1.23.2 scipy==1.9.2 + + env: + SKIP_NETWORK_TESTS: 1 + TEST_DEPENDS: pytest mock testfixtures + BUILD_DEPENDS: ${{ matrix.build-depends }} + + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + fetch-depth: 0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install virtualenv + + - name: Build Wheel run: | echo ::group::Set up dependencies python --version python -c "import struct; print(struct.calcsize('P') * 8)" - python -m pip install -U pip setuptools wheel wheelhouse_uploader ${{ env.BUILD_DEPENDS }} echo ::endgroup:: - echo ::group::Build wheel python setup.py bdist_wheel echo ::endgroup - echo ::group::Install run ls dist python continuous_integration/install_wheel.py echo ::endgroup:: - # - # For consistency with the multibuild step. + # For consistency with the multibuild step. The wheel uploader expects + # the wheels to be under wheelhouse. # mv dist wheelhouse @@ -230,34 +302,14 @@ jobs: find . -type f -name "*.egg" -exec rm -v {} \; python -m venv test_environment - # - # Multibuild has a test step but it essentially just installs the wheel - # and runs the test, and requires a lot of magic to get it working. - # It also does not work under Windows. - # So, we create our own simple test step here. - # - - name: Install and Test Wheel (Linux, MacOS) - if: matrix.os != 'windows-latest' - run: | - . test_environment/bin/activate - pip install pytest testfixtures mock - pip install wheelhouse/*.whl - cd test_environment - python -c 'import gensim;print(gensim.__version__)' - # - # This part relies on the wheel containing tests and required data. - # If we remove that from the wheel, we'll need to rewrite this step. 
- # - pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim - # # We need a separate testing step for windows because the command for # activating the virtual environment is slightly different # - name: Install and Test Wheel (Windows) - if: matrix.os == 'windows-latest' run: | test_environment/Scripts/activate.bat + python -m pip install --upgrade pip pip install pytest testfixtures mock pip install wheelhouse/*.whl cd test_environment @@ -276,6 +328,9 @@ jobs: # if: ${{ always() && env.WHEELHOUSE_UPLOADER_USERNAME && env.WHEELHOUSE_UPLOADER_SECRET }} run: | - pip install wheelhouse-uploader + python -m pip install wheelhouse-uploader ls wheelhouse/*.whl python -m wheelhouse_uploader upload --local-folder wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn + env: + WHEELHOUSE_UPLOADER_USERNAME: ${{ secrets.AWS_ACCESS_KEY_ID }} + WHEELHOUSE_UPLOADER_SECRET: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3cb54fe8be..9d52759538 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,10 +9,10 @@ jobs: linters: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} @@ -47,9 +47,9 @@ jobs: needs: [linters] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: # # We use Py3.8 here for historical reasons. @@ -65,7 +65,6 @@ jobs: sudo apt-get -yq remove texlive-binaries --purge sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk sudo apt-get -yq install build-essential python3.8-dev - - name: Install gensim and its dependencies run: pip install -e .[docs] @@ -73,7 +72,6 @@ jobs: run: | python setup.py build_ext --inplace make -C docs/src clean html - # # FIXME: do we want to store the built documentation somewhere, or is # knowing that the docs built successfully enough? @@ -90,15 +88,15 @@ jobs: fail-fast: false matrix: include: - - {python: 3.7, os: ubuntu-20.04} - {python: 3.8, os: ubuntu-20.04} - {python: 3.9, os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} - - {python: 3.7, os: windows-2019} - {python: 3.8, os: windows-2019} - {python: 3.9, os: windows-2019} - {python: '3.10', os: windows-2019} + - {python: '3.11', os: windows-2019} # # Don't run this job unless the linters have succeeded. 
@@ -108,9 +106,9 @@ jobs: needs: [linters] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - name: Update pip @@ -129,14 +127,12 @@ jobs: curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt - - name: Install GDB & enable core dumps if: matrix.os == 'ubuntu-20.04' run: | sudo apt-get update -y sudo apt-get install -y gdb ulimit -c unlimited -S # enable core dumps - - name: Install gensim and its dependencies if: matrix.os != 'windows' run: pip install -e .[test] @@ -150,7 +146,6 @@ jobs: python --version pip --version python setup.py build_ext --inplace - # # Some of our tests are hanging, and I strongly suspect it's because of the coverage plugin. # diff --git a/.gitignore b/.gitignore index 019e1812f7..8853bd683a 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,6 @@ Thumbs.db # Other # ######### -.tox/ .cache/ .project .pydevproject diff --git a/.gitmodules b/.gitmodules index 347fe93043..52a1b1716c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "multibuild"] path = multibuild - url = https://github.com/matthew-brett/multibuild.git + url = https://github.com/multi-build/multibuild diff --git a/.travis.yml b/.travis.yml index f1c9f05e99..8937c0c74c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,25 +26,13 @@ env: # them here for now. They'll get picked up by the multibuild stuff # running in multibuild/common_utils.sh. # - - TEST_DEPENDS="pytest mock cython nmslib pyemd testfixtures Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 scikit-learn" + - TEST_DEPENDS="pytest mock cython nmslib POT testfixtures python-levenshtein==0.12.0 visdom==0.1.8.9 scikit-learn" matrix: # # See .github/workflows/build-wheels.yml for a discussion of why we # handle numpy versions explicitly. # - - os: linux - env: - - MB_PYTHON_VERSION=3.6 - # - # scipy 1.7.0 wheels not available for Py3.6, so we have to build using - # an older version. - # - - BUILD_DEPENDS="numpy==1.19.2 scipy==1.5.3" - - os: linux - env: - - MB_PYTHON_VERSION=3.7 - - BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0" - os: linux env: - MB_PYTHON_VERSION=3.8 @@ -58,6 +46,14 @@ matrix: # this numpy release are available via PyPI. 
# - BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0" + - os: linux + env: + - MB_PYTHON_VERSION=3.10 + - BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0" + - os: linux + env: + - MB_PYTHON_VERSION=3.11 + - BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0" before_install: - source multibuild/common_utils.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 9718b90b64..a1fb190013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,45 @@ Changes ======= -## Unreleased +## 4.3.0, 2022-12-17 +### :star2: New Features + +* Added support for Python 3.11 and drop support for Python 3.7 (__[acul3](https://github.com/acul3)__, [#3402](https://github.com/RaRe-Technologies/gensim/pull/3402)) +* Added a new model: Flsamodel (__[ERijck](https://github.com/ERijck)__, [#3398](https://github.com/RaRe-Technologies/gensim/pull/3398)) + +### :red_circle: Bug fixes + +* Fixed bug in loss computation for Word2Vec with hierarchical softmax (__[TalIfargan](https://github.com/TalIfargan)__, [#3397](https://github.com/RaRe-Technologies/gensim/pull/3397)) +* Patch Coherence Model to correctly handle empty documents (__[PrimozGodec](https://github.com/PrimozGodec)__, [#3406](https://github.com/RaRe-Technologies/gensim/pull/3406)) +* Fixed bug that prevents loading old models (__[funasshi](https://github.com/funasshi)__, [#3359](https://github.com/RaRe-Technologies/gensim/pull/3359)) +* Fixed deprecation warning from pytest (__[martino-vic](https://github.com/martino-vic)__, [#3354](https://github.com/RaRe-Technologies/gensim/pull/3354)) +* Fixed FastTextKeyedVectors handling in add_vector (__[globba](https://github.com/globba)__, [#3389](https://github.com/RaRe-Technologies/gensim/pull/3389)) +* Fixed typo in word2vec and KeyedVectors docstrings (__[dymil](https://github.com/dymil)__, [#3365](https://github.com/RaRe-Technologies/gensim/pull/3365)) +* Fix backwards compatibility bug in Word2Vec, (**[@mpenkov](https://github.com/mpenkov)**, [#3415](https://github.com/RaRe-Technologies/gensim/pull/3415)) +* Fix numpy hack in setup.py, by (**[@mpenkov](https://github.com/mpenkov)**, [#3416](https://github.com/RaRe-Technologies/gensim/pull/3416)) + +### :books: Tutorial and doc improvements + +* Clarified runtime expectations (__[gojomo](https://github.com/gojomo)__, [#3381](https://github.com/RaRe-Technologies/gensim/pull/3381)) +* Copyedit and fix outdated statements in translation matrix tutorial (__[dymil](https://github.com/dymil)__, [#3375](https://github.com/RaRe-Technologies/gensim/pull/3375)) +* Disabled the Gensim 3=>4 warning in docs (__[piskvorky](https://github.com/piskvorky)__, [#3346](https://github.com/RaRe-Technologies/gensim/pull/3346)) +* Fixed the broken link in readme.md (__[aswin2108](https://github.com/aswin2108)__, [#3409](https://github.com/RaRe-Technologies/gensim/pull/3409)) +* Giving missing credit in EnsembleLDA to Alex in docs (__[sezanzeb](https://github.com/sezanzeb)__, [#3393](https://github.com/RaRe-Technologies/gensim/pull/3393)) + +### :+1: Improvements + +* Switched to Cython language level 3 (__[pabs3](https://github.com/pabs3)__, [#3344](https://github.com/RaRe-Technologies/gensim/pull/3344)) +* Declare variables prior to for loop in fastss.pyx for ANSI C compatibility (__[hstk30](https://github.com/hstk30)__, [#3378](https://github.com/RaRe-Technologies/gensim/pull/3378)) +* Implement numpy hack in setup.py to enable install under Poetry (__[jaymegordo](https://github.com/jaymegordo)__, [#3363](https://github.com/RaRe-Technologies/gensim/pull/3363)) +* Replaceed np.multiply with np.square and 
copyedit in translation_matrix.py (__[dymil](https://github.com/dymil)__, [#3374](https://github.com/RaRe-Technologies/gensim/pull/3374)) + +### 🔮 Testing, CI, housekeeping + +* Clean up references to `Morfessor`, `tox` and `gensim.models.wrappers` (__[pabs3](https://github.com/pabs3)__, [#3345](https://github.com/RaRe-Technologies/gensim/pull/3345)) +* Pinned sphinx versions, add explicit gallery_top label (__[mpenkov](https://github.com/mpenkov)__, [#3383](https://github.com/RaRe-Technologies/gensim/pull/3383)) +* Updated Python module MANIFEST (__[pabs3](https://github.com/pabs3)__, [#3343](https://github.com/RaRe-Technologies/gensim/pull/3343)) +* Refactored wheel building and testing workflow (__[mpenkov](https://github.com/mpenkov)__, [#3410](https://github.com/RaRe-Technologies/gensim/pull/3410)) ## 4.2.0, 2022-04-29 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0eeb90591b..09f2f5a870 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,11 +20,9 @@ Also, please check the [Gensim FAQ](https://github.com/RaRe-Technologies/gensim/ - For windows: `pip install -e .[test-win]` 5. Implement your changes 6. Check that everything's OK in your branch: - - Check it for PEP8: `tox -e flake8` - - Build its documentation (works only for MacOS/Linux): `tox -e docs` (documentation stored in `docs/src/_build`) - - Run unit tests: `tox -e py{version}-{os}`, for example `tox -e py35-linux` or `tox -e py36-win` where - - `{version}` is one of `35`, `36` - - `{os}` is either `win` or `linux` + - Check it for PEP8: `flake8 --ignore E12,W503 --max-line-length 120 --show-source gensim` + - Build its documentation (works only for MacOS/Linux): `make -C docs/src html` (documentation stored in `docs/src/_build`) + - Run unit tests: `pytest -v gensim/test` 7. Add files, commit and push: `git add ... ; git commit -m "my commit message"; git push origin my-feature` 8. [Create a PR](https://help.github.com/articles/creating-a-pull-request/) on Github. Write a **clear description** for your PR, including all the context and relevant information, such as: - The issue that you fixed, e.g. `Fixes #123` diff --git a/MANIFEST.in b/MANIFEST.in index 8aa14d25b8..cc4323533f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,8 +2,6 @@ recursive-include gensim/test/test_data * include README.md include CHANGELOG.md include COPYING -include COPYING.LESSER -include ez_setup.py include gensim/models/voidptr.h include gensim/models/stdint_wrapper.h @@ -16,7 +14,7 @@ include gensim/models/word2vec_corpusfile.cpp include gensim/models/word2vec_corpusfile.pyx include gensim/models/word2vec_corpusfile.pxd -include gensim/models/doc2vec_inner.c +include gensim/models/doc2vec_inner.cpp include gensim/models/doc2vec_inner.pyx include gensim/models/doc2vec_inner.pxd include gensim/models/doc2vec_corpusfile.cpp diff --git a/README.md b/README.md index f1cb9f3ddd..b4bd542f65 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ Adopters | [Tailwind](https://www.tailwindapp.com/) | ![tailwind](docs/src/readme_images/tailwind.png) | Media | Post interesting and relevant content to Pinterest. | | [Issuu](https://issuu.com/) | ![issuu](docs/src/readme_images/issuu.png) | Media | Gensim's LDA module lies at the very core of the analysis we perform on each uploaded publication to figure out what it's all about. | | [Search Metrics](http://www.searchmetrics.com/) | ![search-metrics](docs/src/readme_images/search-metrics.png) | Content Marketing | Gensim word2vec used for entity disambiguation in Search Engine Optimisation. 
| -| [12K Research](https://12k.co/) | ![12k](docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | +| [12K Research](https://12k.com/) | ![12k](docs/src/readme_images/12k.png)| Media | Document similarity analysis on media articles. | | [Stillwater Supercomputing](http://www.stillwater-sc.com/) | ![stillwater](docs/src/readme_images/stillwater.png) | Hardware | Document comprehension and association with word2vec. | | [SiteGround](https://www.siteground.com/) | ![siteground](docs/src/readme_images/siteground.png) | Web hosting | An ensemble search engine which uses different embeddings models and similarities, including word2vec, WMD, and LDA. | | [Capital One](https://www.capitalone.com/) | ![capitalone](docs/src/readme_images/capitalone.png) | Finance | Topic modeling for customer complaints exploration. | diff --git a/docs/notebooks/WMD_tutorial.ipynb b/docs/notebooks/WMD_tutorial.ipynb index ff1f608dc5..9b051104d5 100644 --- a/docs/notebooks/WMD_tutorial.ipynb +++ b/docs/notebooks/WMD_tutorial.ipynb @@ -30,7 +30,7 @@ "\n", "## Running this notebook\n", "\n", - "You can download this [iPython Notebook](http://ipython.org/notebook.html), and run it on your own computer, provided you have installed Gensim, PyEMD, NLTK, and downloaded the necessary data.\n", + "You can download this [iPython Notebook](http://ipython.org/notebook.html), and run it on your own computer, provided you have installed Gensim, POT, NLTK, and downloaded the necessary data.\n", "\n", "The notebook was run on an Ubuntu machine with an Intel core i7-4770 CPU 3.40GHz (8 cores) and 32 GB memory. Running the entire notebook on this machine takes about 3 minutes.\n", "\n", @@ -524,8 +524,7 @@ "source": [ "## References\n", "\n", - "1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008.\n", - "* Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009.\n", + "1. * Rémi Flamary et al. *POT: Python Optimal Transport*, 2021.\n", "* Matt Kusner et al. *From Embeddings To Document Distances*, 2015.\n", "* Thomas Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013." ] diff --git a/docs/notebooks/soft_cosine_tutorial.ipynb b/docs/notebooks/soft_cosine_tutorial.ipynb index 4c7fceb1df..a8d1c41555 100644 --- a/docs/notebooks/soft_cosine_tutorial.ipynb +++ b/docs/notebooks/soft_cosine_tutorial.ipynb @@ -30,7 +30,7 @@ ">\n", "\n", "## Running this notebook\n", - "You can download this [Jupyter notebook](http://jupyter.org/), and run it on your own computer, provided you have installed the `gensim`, `jupyter`, `sklearn`, `pyemd`, and `wmd` Python packages.\n", + "You can download this [Jupyter notebook](http://jupyter.org/), and run it on your own computer, provided you have installed the `gensim`, `jupyter`, `sklearn`, `POT`, and `wmd` Python packages.\n", "\n", "The notebook was run on an Ubuntu machine with an Intel core i7-6700HQ CPU 3.10GHz (4 cores) and 16 GB memory. Assuming all resources required by the notebook have already been downloaded, running the entire notebook on this machine takes about 30 minutes." 
] @@ -357,7 +357,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install pyemd" + "!pip install POT" ] }, { @@ -404,7 +404,7 @@ " return similarities\n", "\n", "def wmd_gensim(query, documents):\n", - " # Compute Word Mover's Distance as implemented in PyEMD by William Mayner\n", + " # Compute Word Mover's Distance as implemented in POT\n", " # between the query and the documents.\n", " index = WmdSimilarity(documents, w2v_model)\n", " similarities = index[query]\n", @@ -532,26 +532,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Dataset | Strategy | MAP score | Elapsed time (sec)\n", - ":---|:---|:---|---:\n", - "2016-test|softcossim|78.52 ±11.18|6.00 ±0.79\n", - "2016-test|**Winner (UH-PRHLT-primary)**|76.70 ±0.00|\n", - "2016-test|cossim|76.45 ±10.40|0.64 ±0.08\n", - "2016-test|wmd-gensim|76.23 ±11.42|5.37 ±0.64\n", - "2016-test|**Baseline 1 (IR)**|74.75 ±0.00|\n", - "2016-test|wmd-relax|71.05 ±11.06|1.11 ±0.09\n", - "2016-test|**Baseline 2 (random)**|46.98 ±0.00|\n", - "\n", - "\n", - "Dataset | Strategy | MAP score | Elapsed time (sec)\n", - ":---|:---|:---|---:\n", - "2017-test|**Winner (SimBow-primary)**|47.22 ±0.00|\n", - "2017-test|softcossim|45.88 ±16.22|7.08 ±1.49\n", - "2017-test|cossim|44.38 ±14.71|0.74 ±0.10\n", - "2017-test|wmd-gensim|44.06 ±15.92|6.20 ±0.87\n", - "2017-test|wmd-relax|43.52 ±16.30|1.30 ±0.18\n", - "2017-test|**Baseline 1 (IR)**|41.85 ±0.00|\n", - "2017-test|**Baseline 2 (random)**|29.81 ±0.00|" + "Dataset | Strategy | MAP score | Elapsed time (sec)\n", + ":---|:---|:---|---:\n", + "2016-test|softcossim|78.52 ±11.18|6.00 ±0.79\n", + "2016-test|**Winner (UH-PRHLT-primary)**|76.70 ±0.00|\n", + "2016-test|cossim|76.45 ±10.40|0.64 ±0.08\n", + "2016-test|wmd-gensim|76.23 ±11.42|5.37 ±0.64\n", + "2016-test|**Baseline 1 (IR)**|74.75 ±0.00|\n", + "2016-test|wmd-relax|71.05 ±11.06|1.11 ±0.09\n", + "2016-test|**Baseline 2 (random)**|46.98 ±0.00|\n", + "\n", + "\n", + "Dataset | Strategy | MAP score | Elapsed time (sec)\n", + ":---|:---|:---|---:\n", + "2017-test|**Winner (SimBow-primary)**|47.22 ±0.00|\n", + "2017-test|softcossim|45.88 ±16.22|7.08 ±1.49\n", + "2017-test|cossim|44.38 ±14.71|0.74 ±0.10\n", + "2017-test|wmd-gensim|44.06 ±15.92|6.20 ±0.87\n", + "2017-test|wmd-relax|43.52 ±16.30|1.30 ±0.18\n", + "2017-test|**Baseline 1 (IR)**|41.85 ±0.00|\n", + "2017-test|**Baseline 2 (random)**|29.81 ±0.00|" ] }, { diff --git a/docs/notebooks/translation_matrix.ipynb b/docs/notebooks/translation_matrix.ipynb index 8832f732e6..8e7eefdbbd 100644 --- a/docs/notebooks/translation_matrix.ipynb +++ b/docs/notebooks/translation_matrix.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Tranlation Matrix Tutorial" + "# Translation Matrix Tutorial" ] }, { @@ -34,14 +34,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Tomas Mikolov, Quoc V Le, Ilya Sutskever. 2013.[Exploiting Similarities among Languages for Machine Translation](https://arxiv.org/pdf/1309.4168.pdf)\n", + "Tomas Mikolov, Quoc V Le, Ilya Sutskever. 2013. [Exploiting Similarities among Languages for Machine Translation](https://arxiv.org/pdf/1309.4168.pdf)\n", "\n", - "Georgiana Dinu, Angelikie Lazaridou and Marco Baroni. 2014.[Improving zero-shot learning by mitigating the hubness problem](https://arxiv.org/pdf/1309.4168.pdf)" + "Georgiana Dinu, Angelikie Lazaridou and Marco Baroni. 2014. 
[Improving zero-shot learning by mitigating the hubness problem](https://arxiv.org/pdf/1309.4168.pdf)" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -49,7 +49,8 @@ "\n", "from gensim import utils\n", "from gensim.models import translation_matrix\n", - "from gensim.models import KeyedVectors" + "from gensim.models import KeyedVectors\n", + "import smart_open" ] }, { @@ -65,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -74,30 +75,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'OPUS_en_it_europarl_train_5K.txt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtrain_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"OPUS_en_it_europarl_train_5K.txt\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mword_pair\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mword_pair\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mtransport_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_ext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_extension\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransport_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransport_params\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mscrubbed_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(uri, mode, buffering, encoding, errors, newline, closefd, opener, ignore_ext, transport_params)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m )\n\u001b[1;32m 309\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36m_shortcut_open\u001b[0;34m(uri, mode, ignore_ext, buffering, encoding, errors)\u001b[0m\n\u001b[1;32m 496\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 497\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPY3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 498\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_builtin_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mopen_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 499\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mopen_kwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_builtin_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'OPUS_en_it_europarl_train_5K.txt'" + "name": "stdout", + "output_type": "stream", + "text": [ + "[('for', 'per'), ('that', 'che'), ('with', 'con'), ('are', 'are'), ('are', 'sono'), ('this', 'questa'), ('this', 'questo'), ('you', 'lei'), ('not', 'non'), ('which', 'che')]\n" ] } ], "source": [ "train_file = \"OPUS_en_it_europarl_train_5K.txt\"\n", "\n", - "with utils.smart_open(train_file, \"r\") as f:\n", + "with smart_open.open(train_file, \"r\") as f:\n", " word_pair = [tuple(utils.to_unicode(line).strip().split()) for line in f]\n", - "print (word_pair[:10])" + "print(word_pair[:10])" ] }, { 
@@ -151,14 +145,14 @@ "source": [ "transmat = translation_matrix.TranslationMatrix(source_word_vec, target_word_vec, word_pair)\n", "transmat.train(word_pair)\n", - "print (\"the shape of translation matrix is: \", transmat.translation_matrix.shape)" + "print(\"the shape of translation matrix is: \", transmat.translation_matrix.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Prediction Time: For any given new word, we can map it to the other language space by coputing $z = Wx$, then we find the word whose representation is closet to z in the target language space, using consine similarity as the distance metric." + "Prediction Time: For any given new word, we can map it to the other language space by computing $z = Wx$, then we find the word whose representation is closet to z in the target language space, using cosine similarity as the distance metric." ] }, { @@ -190,7 +184,7 @@ "outputs": [], "source": [ "for k, v in translated_word.iteritems():\n", - " print (\"word \", k, \" and translated word\", v)" + " print(\"word \", k, \" and translated word\", v)" ] }, { @@ -211,7 +205,7 @@ "source_word, target_word = zip(*words)\n", "translated_word = transmat.translate(source_word, 5)\n", "for k, v in translated_word.iteritems():\n", - " print (\"word \", k, \" and translated word\", v)" + " print(\"word \", k, \" and translated word\", v)" ] }, { @@ -232,7 +226,7 @@ "source_word, target_word = zip(*words)\n", "translated_word = transmat.translate(source_word, 5)\n", "for k, v in translated_word.iteritems():\n", - " print (\"word \", k, \" and translated word\", v)" + " print(\"word \", k, \" and translated word\", v)" ] }, { @@ -246,7 +240,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Testing the creation time, we extracted more word pairs from a dictionary built from Europarl([Europara, en-it](http://opus.lingfil.uu.se/)). We obtain about 20K word pairs and their coresponding word vectors or you can download from this.[word_dict.pkl](https://pan.baidu.com/s/1dF8HUX7)" + "Testing the creation time, we extracted more word pairs from a dictionary built from Europarl([Europara, en-it](http://opus.lingfil.uu.se/)). We obtain about 20K word pairs and their corresponding word vectors or you can download from this: [word_dict.pkl](https://pan.baidu.com/s/1dF8HUX7)" ] }, { @@ -257,9 +251,9 @@ "source": [ "import pickle\n", "word_dict = \"word_dict.pkl\"\n", - "with utils.smart_open(word_dict, \"r\") as f:\n", + "with smart_open.open(word_dict, \"r\") as f:\n", " word_pair = pickle.load(f)\n", - "print (\"the length of word pair \", len(word_pair))" + "print(\"the length of word pair \", len(word_pair))" ] }, { @@ -423,7 +417,7 @@ "\n", "# Translate the English word five to Italian word\n", "translated_word = transmat.translate([en_words[4]], 3)\n", - "print \"translation of five: \", translated_word\n", + "print(\"translation of five: \", translated_word)\n", "\n", "# the translated words of five\n", "for item in translated_word[en_words[4]]:\n", @@ -518,7 +512,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's see some animal words, the figue shows that most of words are also share the similar geometric arrangements." + "Let's see some animal words, the figure shows that most of the words also have similar geometric arrangements." 
] }, { @@ -593,7 +587,7 @@ "\n", "# Translate the English word birds to Italian word\n", "translated_word = transmat.translate([en_words[4]], 3)\n", - "print \"translation of birds: \", translated_word\n", + "print(\"translation of birds: \", translated_word)\n", "\n", "# the translated words of birds\n", "for item in translated_word[en_words[4]]:\n", @@ -700,7 +694,7 @@ "source": [ "As dicussion in this [PR](https://github.com/RaRe-Technologies/gensim/pull/1434), Translation Matrix not only can used to translate the words from one source language to another target lanuage, but also to translate new document vectors back to old model space.\n", "\n", - "For example, if we have trained 15k documents using doc2vec (we called this as model1), and we are going to train new 35k documents using doc2vec(we called this as model2). So we can include those 15k documents as reference documents into the new 35k documents. Then we can get 15k document vectors from model1 and 50k document vectors from model2, but both of the two models have vectors for those 15k documents. We can use those vectors to build a mapping from model1 to model2. Finally, with this relation, we can back-mapping the model2's vector to model1. Therefore, 35k document vectors are learned using this method." + "For example, if we have trained 15k documents using doc2vec (we called this as model1), and we are going to train new 35k documents using doc2vec (we called this as model2). So we can include those 15k documents as reference documents into the new 35k documents. Then we can get 15k document vectors from model1 and 50k document vectors from model2, but both of the two models have vectors for those 15k documents. We can use those vectors to build a mapping from model1 to model2. Finally, with this relation, we can back-map the model2's vector to model1. Therefore, 35k document vectors are learned using this method." ] }, { @@ -720,13 +714,13 @@ "from gensim.models.doc2vec import TaggedDocument\n", "from gensim.models import Doc2Vec\n", "from collections import namedtuple\n", - "from gensim import utils\n", + "import smart_open\n", "\n", "def read_sentimentDocs():\n", " SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n", "\n", " alldocs = [] # will hold all docs in original order\n", - " with utils.smart_open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n", + " with smart_open.open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n", " for line_no, line in enumerate(alldata):\n", " tokens = gensim.utils.to_unicode(line).split()\n", " words = tokens[1:]\n", @@ -748,14 +742,14 @@ "small_corpus = train_docs[:15000]\n", "large_corpus = train_docs + test_docs\n", "\n", - "print len(train_docs), len(test_docs), len(doc_list), len(small_corpus), len(large_corpus)" + "print(len(train_docs), len(test_docs), len(doc_list), len(small_corpus), len(large_corpus))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here, we train two Doc2vec model, the parameters can be determined by yourself. We trained on 15k documents for the `model1` and 50k documents for the `model2`. But you should mixed some documents which from the 15k document in `model` to the `model2` as dicussed before. " + "Here, we train two Doc2vec model, the parameters can be determined by yourself. We trained on 15k documents for the `model1` and 50k documents for the `model2`. But you should mix some documents which from the 15k document in `model` to the `model2`, as discussed before. 
" ] }, { @@ -795,7 +789,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For the IMDB training dataset, we train an classifier on the train data which has 25k documents with positive and negative label. Then using this classifier to predict the test data. To see what accuracy can the document vectors which learned by different method achieve." + "For the IMDB training dataset, we train an classifier on the train data which has 25k documents with positive and negative label. Then using this classifier to predict the test data, we see what accuracy can be achieved by the document vectors learned by different methods." ] }, { @@ -812,7 +806,7 @@ " classifier = LogisticRegression()\n", " classifier.fit(train, train_label)\n", " score = classifier.score(test, test_label)\n", - " print \"the classifier score :\", score\n", + " print(\"the classifier score :\", score)\n", " return score" ] }, @@ -855,7 +849,7 @@ " test_array[i + 12500] = m2[i + 37500]\n", " test_label[i + 12500] = 0\n", "\n", - "print \"The vectors are learned by doc2vec method\"\n", + "print(\"The vectors are learned by doc2vec method\")\n", "test_classifier_error(train_array, train_label, test_array, test_label)" ] }, @@ -910,7 +904,7 @@ " test_array[i + 12500] = m1[i + 37500]\n", " test_label[i + 12500] = 0\n", "\n", - "print \"The vectors are learned by back-mapping method\"\n", + "print(\"The vectors are learned by back-mapping method\")\n", "test_classifier_error(train_array, train_label, test_array, test_label)" ] }, @@ -918,7 +912,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we can see that, the vectors learned by back-mapping method performed not bad but still need improved." + "As we can see that, the vectors learned by back-mapping method performed not bad but still need to be improved." ] }, { @@ -1026,18 +1020,11 @@ "source": [ "You probably will see kinds of colors point. One for the `model1`, the `sdoc0` to `sdoc4` document vector are learned by Doc2vec and `sdoc5` and `sdoc6` are learned by back-mapping. One for the `model2`, the `tdoc0` to `tdoc6` are learned by Doc2vec. We can see that some of points learned from the back-mapping method still have the relative position with the point learned by Doc2vec." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.10.2 64-bit", "language": "python", "name": "python3" }, @@ -1051,7 +1038,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.10.2" + }, + "vscode": { + "interpreter": { + "hash": "901b79e026e03396fd1ffa7133844e9ea80e258ce34c66e1aabb5896bcb18463" + } } }, "nbformat": 4, diff --git a/docs/src/auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png b/docs/src/auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png index 3dbf224404..2b3ddc2b4f 100644 Binary files a/docs/src/auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png and b/docs/src/auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png differ diff --git a/docs/src/auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png b/docs/src/auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png index eebecf5b78..81f90665d1 100644 Binary files a/docs/src/auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png and b/docs/src/auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png differ diff --git a/docs/src/auto_examples/core/index.rst b/docs/src/auto_examples/core/index.rst new file mode 100644 index 0000000000..03716729a9 --- /dev/null +++ b/docs/src/auto_examples/core/index.rst @@ -0,0 +1,98 @@ + + +.. _sphx_glr_auto_examples_core: + +Core Tutorials: New Users Start Here! +------------------------------------- + +If you're new to gensim, we recommend going through all core tutorials in order. +Understanding this functionality is vital for using gensim effectively. + + + +.. raw:: html + +
+    <div class="sphx-glr-thumbnails">
+
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer">
+
+.. only:: html
+
+  .. image:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png
+    :alt: Core Concepts
+
+  :ref:`sphx_glr_auto_examples_core_run_core_concepts.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Core Concepts</div>
+    </div>
+
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer">
+
+.. only:: html
+
+  .. image:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png
+    :alt: Corpora and Vector Spaces
+
+  :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Corpora and Vector Spaces</div>
+    </div>
+
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer">
+
+.. only:: html
+
+  .. image:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png
+    :alt: Topics and Transformations
+
+  :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Topics and Transformations</div>
+    </div>
+
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer">
+
+.. only:: html
+
+  .. image:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png
+    :alt: Similarity Queries
+
+  :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Similarity Queries</div>
+    </div>
+
+
+.. raw:: html
+
+    </div>
+ + +.. toctree:: + :hidden: + + /auto_examples/core/run_core_concepts + /auto_examples/core/run_corpora_and_vector_spaces + /auto_examples/core/run_topics_and_transformations + /auto_examples/core/run_similarity_queries + diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.ipynb b/docs/src/auto_examples/core/run_topics_and_transformations.ipynb index c5a5fbb709..4f40de02be 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.ipynb +++ b/docs/src/auto_examples/core/run_topics_and_transformations.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nTopics and Transformations\n===========================\n\nIntroduces transformations and demonstrates their use on a toy corpus.\n\n" + "\n# Topics and Transformations\n\nIntroduces transformations and demonstrates their use on a toy corpus.\n" ] }, { @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this tutorial, I will show how to transform documents from one vector representation\ninto another. This process serves two goals:\n\n1. To bring out hidden structure in the corpus, discover relationships between\n words and use them to describe the documents in a new and\n (hopefully) more semantic way.\n2. To make the document representation more compact. This both improves efficiency\n (new representation consumes less resources) and efficacy (marginal data\n trends are ignored, noise-reduction).\n\nCreating the Corpus\n-------------------\n\nFirst, we need to create a corpus to work with.\nThis step is the same as in the previous tutorial;\nif you completed it, feel free to skip to the next section.\n\n" + "In this tutorial, I will show how to transform documents from one vector representation\ninto another. This process serves two goals:\n\n1. To bring out hidden structure in the corpus, discover relationships between\n words and use them to describe the documents in a new and\n (hopefully) more semantic way.\n2. To make the document representation more compact. This both improves efficiency\n (new representation consumes less resources) and efficacy (marginal data\n trends are ignored, noise-reduction).\n\n## Creating the Corpus\n\nFirst, we need to create a corpus to work with.\nThis step is the same as in the previous tutorial;\nif you completed it, feel free to skip to the next section.\n\n" ] }, { @@ -51,7 +51,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Creating a transformation\n++++++++++++++++++++++++++\n\nThe transformations are standard Python objects, typically initialized by means of\na :dfn:`training corpus`:\n\n\n" + "### Creating a transformation\n\nThe transformations are standard Python objects, typically initialized by means of\na :dfn:`training corpus`:\n\n\n" ] }, { @@ -69,7 +69,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We used our old corpus from tutorial 1 to initialize (train) the transformation model. Different\ntransformations may require different initialization parameters; in case of TfIdf, the\n\"training\" consists simply of going through the supplied corpus once and computing document frequencies\nof all its features. Training other models, such as Latent Semantic Analysis or Latent Dirichlet\nAllocation, is much more involved and, consequently, takes much more time.\n\n

<div class=\"alert alert-info\"><h4>Note</h4><p>Transformations always convert between two specific vector\n spaces. The same vector space (= the same set of feature ids) must be used for training\n as well as for subsequent vector transformations. Failure to use the same input\n feature space, such as applying a different string preprocessing, using different\n feature ids, or using bag-of-words input vectors where TfIdf vectors are expected, will\n result in feature mismatch during transformation calls and consequently in either\n garbage output and/or runtime exceptions.</p></div>
\n\n\nTransforming vectors\n+++++++++++++++++++++\n\nFrom now on, ``tfidf`` is treated as a read-only object that can be used to convert\nany vector from the old representation (bag-of-words integer counts) to the new representation\n(TfIdf real-valued weights):\n\n" + "We used our old corpus from tutorial 1 to initialize (train) the transformation model. Different\ntransformations may require different initialization parameters; in case of TfIdf, the\n\"training\" consists simply of going through the supplied corpus once and computing document frequencies\nof all its features. Training other models, such as Latent Semantic Analysis or Latent Dirichlet\nAllocation, is much more involved and, consequently, takes much more time.\n\n

<div class=\"alert alert-info\"><h4>Note</h4><p>Transformations always convert between two specific vector\n spaces. The same vector space (= the same set of feature ids) must be used for training\n as well as for subsequent vector transformations. Failure to use the same input\n feature space, such as applying a different string preprocessing, using different\n feature ids, or using bag-of-words input vectors where TfIdf vectors are expected, will\n result in feature mismatch during transformation calls and consequently in either\n garbage output and/or runtime exceptions.</p></div>
\n\n\n### Transforming vectors\n\nFrom now on, ``tfidf`` is treated as a read-only object that can be used to convert\nany vector from the old representation (bag-of-words integer counts) to the new representation\n(TfIdf real-valued weights):\n\n" ] }, { @@ -177,7 +177,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next question might be: just how exactly similar are those documents to each other?\nIs there a way to formalize the similarity, so that for a given input document, we can\norder some other set of documents according to their similarity? Similarity queries\nare covered in the next tutorial (`sphx_glr_auto_examples_core_run_similarity_queries.py`).\n\n\nAvailable transformations\n--------------------------\n\nGensim implements several popular Vector Space Model algorithms:\n\n* `Term Frequency * Inverse Document Frequency, Tf-Idf `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased.\n It therefore converts integer-valued vectors into real-valued ones, while leaving\n the number of dimensions intact. It can also optionally normalize the resulting\n vectors to (Euclidean) unit length.\n\n .. sourcecode:: pycon\n\n model = models.TfidfModel(corpus, normalize=True)\n\n* `Latent Semantic Indexing, LSI (or sometimes LSA) `_\n transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into\n a latent space of a lower dimensionality. For the toy corpus above we used only\n 2 latent dimensions, but on real corpora, target dimensionality of 200--500 is recommended\n as a \"golden standard\" [1]_.\n\n .. sourcecode:: pycon\n\n model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)\n\n LSI training is unique in that we can continue \"training\" at any point, simply\n by providing more training documents. This is done by incremental updates to\n the underlying model, in a process called `online training`. Because of this feature, the\n input document stream may even be infinite -- just keep feeding LSI new documents\n as they arrive, while using the computed transformation model as read-only in the meanwhile!\n\n .. sourcecode:: pycon\n\n model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus\n lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model\n\n model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents\n lsi_vec = model[tfidf_vec]\n\n See the :mod:`gensim.models.lsimodel` documentation for details on how to make\n LSI gradually \"forget\" old observations in infinite streams. If you want to get dirty,\n there are also parameters you can tweak that affect speed vs. memory footprint vs. numerical\n precision of the LSI algorithm.\n\n `gensim` uses a novel online incremental streamed distributed training algorithm (quite a mouthful!),\n which I published in [5]_. `gensim` also executes a stochastic multi-pass algorithm\n from Halko et al. [4]_ internally, to accelerate in-core part\n of the computations.\n See also `wiki` for further speed-ups by distributing the computation across\n a cluster of computers.\n\n* `Random Projections, RP `_ aim to\n reduce vector space dimensionality. 
This is a very efficient (both memory- and\n CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little randomness.\n Recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.\n\n .. sourcecode:: pycon\n\n model = models.RpModel(tfidf_corpus, num_topics=500)\n\n* `Latent Dirichlet Allocation, LDA `_\n is yet another transformation from bag-of-words counts into a topic space of lower\n dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA),\n so LDA's topics can be interpreted as probability distributions over words. These distributions are,\n just like with LSA, inferred automatically from a training corpus. Documents\n are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).\n\n .. sourcecode:: pycon\n\n model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)\n\n `gensim` uses a fast implementation of online LDA parameter estimation based on [2]_,\n modified to run in `distributed mode ` on a cluster of computers.\n\n* `Hierarchical Dirichlet Process, HDP `_\n is a non-parametric bayesian method (note the missing number of requested topics):\n\n .. sourcecode:: pycon\n\n model = models.HdpModel(corpus, id2word=dictionary)\n\n `gensim` uses a fast, online implementation based on [3]_.\n The HDP model is a new addition to `gensim`, and still rough around its academic edges -- use with care.\n\nAdding new :abbr:`VSM (Vector Space Model)` transformations (such as different weighting schemes) is rather trivial;\nsee the `apiref` or directly the `Python code `_\nfor more info and examples.\n\nIt is worth repeating that these are all unique, **incremental** implementations,\nwhich do not require the whole training corpus to be present in main memory all at once.\nWith memory taken care of, I am now improving `distributed`,\nto improve CPU efficiency, too.\nIf you feel you could contribute by testing, providing use-cases or code, see the `Gensim Developer guide `__.\n\nWhat Next?\n----------\n\nContinue on to the next tutorial on `sphx_glr_auto_examples_core_run_similarity_queries.py`.\n\nReferences\n----------\n\n.. [1] Bradford. 2008. An empirical study of required dimensionality for large-scale latent semantic indexing applications.\n\n.. [2] Hoffman, Blei, Bach. 2010. Online learning for Latent Dirichlet Allocation.\n\n.. [3] Wang, Paisley, Blei. 2011. Online variational inference for the hierarchical Dirichlet process.\n\n.. [4] Halko, Martinsson, Tropp. 2009. Finding structure with randomness.\n\n.. [5] \u0158eh\u016f\u0159ek. 2011. Subspace tracking for Latent Semantic Analysis.\n\n" + "The next question might be: just how exactly similar are those documents to each other?\nIs there a way to formalize the similarity, so that for a given input document, we can\norder some other set of documents according to their similarity? 
Similarity queries\nare covered in the next tutorial (`sphx_glr_auto_examples_core_run_similarity_queries.py`).\n\n\n## Available transformations\n\nGensim implements several popular Vector Space Model algorithms:\n\n* `Term Frequency * Inverse Document Frequency, Tf-Idf `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased.\n It therefore converts integer-valued vectors into real-valued ones, while leaving\n the number of dimensions intact. It can also optionally normalize the resulting\n vectors to (Euclidean) unit length.\n\n .. sourcecode:: pycon\n\n model = models.TfidfModel(corpus, normalize=True)\n\n* `Okapi Best Matching, Okapi BM25 `_\n expects a bag-of-words (integer values) training corpus during initialization.\n During transformation, it will take a vector and return another vector of the\n same dimensionality, except that features which were rare in the training corpus\n will have their value increased. It therefore converts integer-valued\n vectors into real-valued ones, while leaving the number of dimensions intact.\n\n Okapi BM25 is the standard ranking function used by search engines to estimate\n the relevance of documents to a given search query.\n\n .. sourcecode:: pycon\n\n model = models.OkapiBM25Model(corpus)\n\n* `Latent Semantic Indexing, LSI (or sometimes LSA) `_\n transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into\n a latent space of a lower dimensionality. For the toy corpus above we used only\n 2 latent dimensions, but on real corpora, target dimensionality of 200--500 is recommended\n as a \"golden standard\" [1]_.\n\n .. sourcecode:: pycon\n\n model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)\n\n LSI training is unique in that we can continue \"training\" at any point, simply\n by providing more training documents. This is done by incremental updates to\n the underlying model, in a process called `online training`. Because of this feature, the\n input document stream may even be infinite -- just keep feeding LSI new documents\n as they arrive, while using the computed transformation model as read-only in the meanwhile!\n\n .. sourcecode:: pycon\n\n model.add_documents(another_tfidf_corpus) # now LSI has been trained on tfidf_corpus + another_tfidf_corpus\n lsi_vec = model[tfidf_vec] # convert some new document into the LSI space, without affecting the model\n\n model.add_documents(more_documents) # tfidf_corpus + another_tfidf_corpus + more_documents\n lsi_vec = model[tfidf_vec]\n\n See the :mod:`gensim.models.lsimodel` documentation for details on how to make\n LSI gradually \"forget\" old observations in infinite streams. If you want to get dirty,\n there are also parameters you can tweak that affect speed vs. memory footprint vs. numerical\n precision of the LSI algorithm.\n\n `gensim` uses a novel online incremental streamed distributed training algorithm (quite a mouthful!),\n which I published in [5]_. `gensim` also executes a stochastic multi-pass algorithm\n from Halko et al. [4]_ internally, to accelerate in-core part\n of the computations.\n See also `wiki` for further speed-ups by distributing the computation across\n a cluster of computers.\n\n* `Random Projections, RP `_ aim to\n reduce vector space dimensionality. 
This is a very efficient (both memory- and\n CPU-friendly) approach to approximating TfIdf distances between documents, by throwing in a little randomness.\n Recommended target dimensionality is again in the hundreds/thousands, depending on your dataset.\n\n .. sourcecode:: pycon\n\n model = models.RpModel(tfidf_corpus, num_topics=500)\n\n* `Latent Dirichlet Allocation, LDA `_\n is yet another transformation from bag-of-words counts into a topic space of lower\n dimensionality. LDA is a probabilistic extension of LSA (also called multinomial PCA),\n so LDA's topics can be interpreted as probability distributions over words. These distributions are,\n just like with LSA, inferred automatically from a training corpus. Documents\n are in turn interpreted as a (soft) mixture of these topics (again, just like with LSA).\n\n .. sourcecode:: pycon\n\n model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)\n\n `gensim` uses a fast implementation of online LDA parameter estimation based on [2]_,\n modified to run in `distributed mode ` on a cluster of computers.\n\n* `Hierarchical Dirichlet Process, HDP `_\n is a non-parametric bayesian method (note the missing number of requested topics):\n\n .. sourcecode:: pycon\n\n model = models.HdpModel(corpus, id2word=dictionary)\n\n `gensim` uses a fast, online implementation based on [3]_.\n The HDP model is a new addition to `gensim`, and still rough around its academic edges -- use with care.\n\nAdding new :abbr:`VSM (Vector Space Model)` transformations (such as different weighting schemes) is rather trivial;\nsee the `apiref` or directly the `Python code `_\nfor more info and examples.\n\nIt is worth repeating that these are all unique, **incremental** implementations,\nwhich do not require the whole training corpus to be present in main memory all at once.\nWith memory taken care of, I am now improving `distributed`,\nto improve CPU efficiency, too.\nIf you feel you could contribute by testing, providing use-cases or code, see the `Gensim Developer guide `__.\n\n## What Next?\n\nContinue on to the next tutorial on `sphx_glr_auto_examples_core_run_similarity_queries.py`.\n\n## References\n\n.. [1] Bradford. 2008. An empirical study of required dimensionality for large-scale latent semantic indexing applications.\n\n.. [2] Hoffman, Blei, Bach. 2010. Online learning for Latent Dirichlet Allocation.\n\n.. [3] Wang, Paisley, Blei. 2011. Online variational inference for the hierarchical Dirichlet process.\n\n.. [4] Halko, Martinsson, Tropp. 2009. Finding structure with randomness.\n\n.. [5] \u0158eh\u016f\u0159ek. 2011. Subspace tracking for Latent Semantic Analysis.\n\n" ] }, { @@ -208,7 +208,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.py b/docs/src/auto_examples/core/run_topics_and_transformations.py index 605584084d..45888505e0 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.py +++ b/docs/src/auto_examples/core/run_topics_and_transformations.py @@ -188,6 +188,20 @@ # # model = models.TfidfModel(corpus, normalize=True) # +# * `Okapi Best Matching, Okapi BM25 `_ +# expects a bag-of-words (integer values) training corpus during initialization. +# During transformation, it will take a vector and return another vector of the +# same dimensionality, except that features which were rare in the training corpus +# will have their value increased. 
It therefore converts integer-valued +# vectors into real-valued ones, while leaving the number of dimensions intact. +# +# Okapi BM25 is the standard ranking function used by search engines to estimate +# the relevance of documents to a given search query. +# +# .. sourcecode:: pycon +# +# model = models.OkapiBM25Model(corpus) +# # * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ # transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into # a latent space of a lower dimensionality. For the toy corpus above we used only diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 b/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 index 4ea3bee39d..ce683c931f 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 +++ b/docs/src/auto_examples/core/run_topics_and_transformations.py.md5 @@ -1 +1 @@ -f49c3821bbacdeefdf3945d5dcb5ad01 \ No newline at end of file +226db24f9e807e4bbd2a6ef280a75510 \ No newline at end of file diff --git a/docs/src/auto_examples/core/run_topics_and_transformations.rst b/docs/src/auto_examples/core/run_topics_and_transformations.rst index a5056ee4e3..64c9675939 100644 --- a/docs/src/auto_examples/core/run_topics_and_transformations.rst +++ b/docs/src/auto_examples/core/run_topics_and_transformations.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/core/run_topics_and_transformations.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code + +.. rst-class:: sphx-glr-example-title - .. _sphx_glr_auto_examples_core_run_topics_and_transformations.py: +.. _sphx_glr_auto_examples_core_run_topics_and_transformations.py: Topics and Transformations @@ -14,6 +23,7 @@ Topics and Transformations Introduces transformations and demonstrates their use on a toy corpus. +.. GENERATED FROM PYTHON SOURCE LINES 7-11 .. code-block:: default @@ -28,6 +38,8 @@ Introduces transformations and demonstrates their use on a toy corpus. +.. GENERATED FROM PYTHON SOURCE LINES 12-28 + In this tutorial, I will show how to transform documents from one vector representation into another. This process serves two goals: @@ -45,6 +57,7 @@ First, we need to create a corpus to work with. This step is the same as in the previous tutorial; if you completed it, feel free to skip to the next section. +.. GENERATED FROM PYTHON SOURCE LINES 28-65 .. code-block:: default @@ -89,8 +102,20 @@ if you completed it, feel free to skip to the next section. +.. rst-class:: sphx-glr-script-out + + Out: + + .. 
code-block:: none + + 2022-07-18 19:59:38,851 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-07-18 19:59:38,852 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions) + 2022-07-18 19:59:38,853 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2022-07-18T19:59:38.852315', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'created'} + + +.. GENERATED FROM PYTHON SOURCE LINES 66-72 Creating a transformation ++++++++++++++++++++++++++ @@ -99,6 +124,7 @@ The transformations are standard Python objects, typically initialized by means a :dfn:`training corpus`: +.. GENERATED FROM PYTHON SOURCE LINES 73-77 .. code-block:: default @@ -110,9 +136,21 @@ a :dfn:`training corpus`: +.. rst-class:: sphx-glr-script-out + + Out: + + .. code-block:: none + + 2022-07-18 19:59:38,866 : INFO : collecting document frequencies + 2022-07-18 19:59:38,866 : INFO : PROGRESS: processing document #0 + 2022-07-18 19:59:38,868 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-07-18T19:59:38.868192', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'initialize'} + +.. GENERATED FROM PYTHON SOURCE LINES 78-100 + We used our old corpus from tutorial 1 to initialize (train) the transformation model. Different transformations may require different initialization parameters; in case of TfIdf, the "training" consists simply of going through the supplied corpus once and computing document frequencies @@ -136,6 +174,7 @@ From now on, ``tfidf`` is treated as a read-only object that can be used to conv any vector from the old representation (bag-of-words integer counts) to the new representation (TfIdf real-valued weights): +.. GENERATED FROM PYTHON SOURCE LINES 100-104 .. code-block:: default @@ -158,8 +197,11 @@ any vector from the old representation (bag-of-words integer counts) to the new +.. GENERATED FROM PYTHON SOURCE LINES 105-106 + Or to apply a transformation to a whole corpus: +.. GENERATED FROM PYTHON SOURCE LINES 106-111 .. code-block:: default @@ -191,6 +233,8 @@ Or to apply a transformation to a whole corpus: +.. GENERATED FROM PYTHON SOURCE LINES 112-128 + In this particular case, we are transforming the same corpus that we used for training, but this is only incidental. Once the transformation model has been initialized, it can be used on any vectors (provided they come from the same vector space, of course), @@ -208,6 +252,7 @@ folding-in for LSA, by topic inference for LDA etc. Transformations can also be serialized, one on top of another, in a sort of chain: +.. GENERATED FROM PYTHON SOURCE LINES 128-132 .. code-block:: default @@ -219,13 +264,36 @@ Transformations can also be serialized, one on top of another, in a sort of chai +.. rst-class:: sphx-glr-script-out + + Out: + + .. 
code-block:: none + + 2022-07-18 19:59:39,141 : INFO : using serial LSI version on this node + 2022-07-18 19:59:39,142 : INFO : updating model with new documents + 2022-07-18 19:59:39,143 : INFO : preparing a new chunk of documents + 2022-07-18 19:59:39,144 : INFO : using 100 extra samples and 2 power iterations + 2022-07-18 19:59:39,144 : INFO : 1st phase: constructing (12, 102) action matrix + 2022-07-18 19:59:39,146 : INFO : orthonormalizing (12, 102) action matrix + 2022-07-18 19:59:39,148 : INFO : 2nd phase: running dense svd on (12, 9) matrix + 2022-07-18 19:59:39,148 : INFO : computing the final decomposition + 2022-07-18 19:59:39,149 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum) + 2022-07-18 19:59:39,150 : INFO : processed documents up to #9 + 2022-07-18 19:59:39,151 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface" + 2022-07-18 19:59:39,151 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees" + 2022-07-18 19:59:39,151 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel in 0.01s', 'datetime': '2022-07-18T19:59:39.151911', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'created'} + + +.. GENERATED FROM PYTHON SOURCE LINES 133-136 Here we transformed our Tf-Idf corpus via `Latent Semantic Indexing `_ into a latent 2-D space (2-D because we set ``num_topics=2``). Now you're probably wondering: what do these two latent dimensions stand for? Let's inspect with :func:`models.LsiModel.print_topics`: +.. GENERATED FROM PYTHON SOURCE LINES 136-139 .. code-block:: default @@ -242,11 +310,15 @@ dimensions stand for? Let's inspect with :func:`models.LsiModel.print_topics`: .. code-block:: none + 2022-07-18 19:59:39,298 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface" + 2022-07-18 19:59:39,298 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees" - [(0, '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'), (1, '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')] + [(0, '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'), (1, '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')] +.. GENERATED FROM PYTHON SOURCE LINES 140-148 + (the topics are printed to log -- see the note at the top of this page about activating logging) @@ -256,6 +328,7 @@ second topic practically concerns itself with all the other words. 
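For readers who want to reproduce this bag-of-words → tf-idf → LSI chain outside the rendered gallery output, a minimal standalone sketch is shown below. It is only an illustration: it assumes the small ``common_texts`` toy corpus bundled with gensim as a stand-in for the nine-document corpus used above, so the printed topics and weights will not match the tutorial's output exactly.

.. sourcecode:: pycon

    from gensim import corpora, models
    from gensim.test.utils import common_texts  # bundled toy corpus, assumed stand-in for the tutorial corpus

    dictionary = corpora.Dictionary(common_texts)
    bow_corpus = [dictionary.doc2bow(text) for text in common_texts]

    tfidf = models.TfidfModel(bow_corpus, normalize=True)                        # bow -> tf-idf
    lsi = models.LsiModel(tfidf[bow_corpus], id2word=dictionary, num_topics=2)   # tf-idf -> 2-D latent space

    print(lsi.print_topics(2))            # what do the two latent dimensions stand for?
    for doc in lsi[tfidf[bow_corpus]]:    # both transformations applied on the fly, document by document
        print(doc)

Note that ``lsi[tfidf[bow_corpus]]`` wraps the corpus lazily, so neither transformation requires the whole corpus to sit in memory at once.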
As expected, the first five documents are more strongly related to the second topic while the remaining four documents to the first topic: +.. GENERATED FROM PYTHON SOURCE LINES 148-153 .. code-block:: default @@ -274,21 +347,24 @@ remaining four documents to the first topic: .. code-block:: none - [(0, 0.06600783396090518), (1, -0.520070330636184)] Human machine interface for lab abc computer applications - [(0, 0.19667592859142694), (1, -0.7609563167700047)] A survey of user opinion of computer system response time - [(0, 0.08992639972446678), (1, -0.72418606267525)] The EPS user interface management system - [(0, 0.07585847652178407), (1, -0.6320551586003417)] System and human system engineering testing of EPS - [(0, 0.10150299184980252), (1, -0.5737308483002961)] Relation of user perceived response time to error measurement - [(0, 0.7032108939378307), (1, 0.16115180214025954)] The generation of random binary unordered trees - [(0, 0.8774787673119826), (1, 0.16758906864659615)] The intersection graph of paths in trees - [(0, 0.9098624686818572), (1, 0.14086553628719237)] Graph minors IV Widths of trees and well quasi ordering - [(0, 0.6165825350569278), (1, -0.05392907566389235)] Graph minors A survey + [(0, 0.06600783396090446), (1, -0.5200703306361851)] Human machine interface for lab abc computer applications + [(0, 0.19667592859142627), (1, -0.7609563167700037)] A survey of user opinion of computer system response time + [(0, 0.08992639972446514), (1, -0.724186062675251)] The EPS user interface management system + [(0, 0.07585847652178207), (1, -0.632055158600343)] System and human system engineering testing of EPS + [(0, 0.10150299184980262), (1, -0.5737308483002944)] Relation of user perceived response time to error measurement + [(0, 0.703210893937831), (1, 0.16115180214025884)] The generation of random binary unordered trees + [(0, 0.8774787673119828), (1, 0.16758906864659542)] The intersection graph of paths in trees + [(0, 0.9098624686818574), (1, 0.14086553628719167)] Graph minors IV Widths of trees and well quasi ordering + [(0, 0.6165825350569278), (1, -0.05392907566389242)] Graph minors A survey + +.. GENERATED FROM PYTHON SOURCE LINES 154-155 Model persistency is achieved with the :func:`save` and :func:`load` functions: +.. GENERATED FROM PYTHON SOURCE LINES 155-165 .. code-block:: default @@ -306,8 +382,30 @@ Model persistency is achieved with the :func:`save` and :func:`load` functions: +.. rst-class:: sphx-glr-script-out + + Out: + + .. 
code-block:: none + + 2022-07-18 19:59:39,441 : INFO : Projection lifecycle event {'fname_or_handle': '/tmp/model-rpai5uj5.lsi.projection', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-07-18T19:59:39.441811', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'saving'} + 2022-07-18 19:59:39,443 : INFO : saved /tmp/model-rpai5uj5.lsi.projection + 2022-07-18 19:59:39,443 : INFO : LsiModel lifecycle event {'fname_or_handle': '/tmp/model-rpai5uj5.lsi', 'separately': 'None', 'sep_limit': 10485760, 'ignore': ['projection', 'dispatcher'], 'datetime': '2022-07-18T19:59:39.443722', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'saving'} + 2022-07-18 19:59:39,444 : INFO : not storing attribute projection + 2022-07-18 19:59:39,444 : INFO : not storing attribute dispatcher + 2022-07-18 19:59:39,444 : INFO : saved /tmp/model-rpai5uj5.lsi + 2022-07-18 19:59:39,444 : INFO : loading LsiModel object from /tmp/model-rpai5uj5.lsi + 2022-07-18 19:59:39,445 : INFO : loading id2word recursively from /tmp/model-rpai5uj5.lsi.id2word.* with mmap=None + 2022-07-18 19:59:39,445 : INFO : setting ignored attribute projection to None + 2022-07-18 19:59:39,445 : INFO : setting ignored attribute dispatcher to None + 2022-07-18 19:59:39,445 : INFO : LsiModel lifecycle event {'fname': '/tmp/model-rpai5uj5.lsi', 'datetime': '2022-07-18T19:59:39.445641', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'loaded'} + 2022-07-18 19:59:39,445 : INFO : loading LsiModel object from /tmp/model-rpai5uj5.lsi.projection + 2022-07-18 19:59:39,446 : INFO : Projection lifecycle event {'fname': '/tmp/model-rpai5uj5.lsi.projection', 'datetime': '2022-07-18T19:59:39.446113', 'gensim': '4.2.1.dev0', 'python': '3.8.13 (default, Jul 12 2022, 12:32:46) \n[GCC 10.2.1 20210110]', 'platform': 'Linux-5.10.0-0.bpo.12-amd64-x86_64-with-glibc2.2.5', 'event': 'loaded'} + + +.. GENERATED FROM PYTHON SOURCE LINES 166-301 The next question might be: just how exactly similar are those documents to each other? Is there a way to formalize the similarity, so that for a given input document, we can @@ -334,6 +432,20 @@ Gensim implements several popular Vector Space Model algorithms: model = models.TfidfModel(corpus, normalize=True) +* `Okapi Best Matching, Okapi BM25 `_ + expects a bag-of-words (integer values) training corpus during initialization. + During transformation, it will take a vector and return another vector of the + same dimensionality, except that features which were rare in the training corpus + will have their value increased. It therefore converts integer-valued + vectors into real-valued ones, while leaving the number of dimensions intact. + + Okapi BM25 is the standard ranking function used by search engines to estimate + the relevance of documents to a given search query. + + .. sourcecode:: pycon + + model = models.OkapiBM25Model(corpus) + * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into a latent space of a lower dimensionality. For the toy corpus above we used only @@ -431,6 +543,7 @@ References .. [5] Řehůřek. 2011. 
Subspace tracking for Latent Semantic Analysis. +.. GENERATED FROM PYTHON SOURCE LINES 301-307 .. code-block:: default @@ -443,9 +556,10 @@ References -.. image:: /auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png - :alt: run topics and transformations - :class: sphx-glr-single-img +.. image-sg:: /auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png + :alt: run topics and transformations + :srcset: /auto_examples/core/images/sphx_glr_run_topics_and_transformations_001.png + :class: sphx-glr-single-img @@ -454,9 +568,9 @@ References .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 0.970 seconds) + **Total running time of the script:** ( 0 minutes 1.658 seconds) -**Estimated memory usage:** 7 MB +**Estimated memory usage:** 58 MB .. _sphx_glr_download_auto_examples_core_run_topics_and_transformations.py: diff --git a/docs/src/auto_examples/core/sg_execution_times.rst b/docs/src/auto_examples/core/sg_execution_times.rst index e206b6d636..26ebfe6747 100644 --- a/docs/src/auto_examples/core/sg_execution_times.rst +++ b/docs/src/auto_examples/core/sg_execution_times.rst @@ -5,14 +5,14 @@ Computation times ================= -**00:05.212** total execution time for **auto_examples_core** files: +**00:01.658** total execution time for **auto_examples_core** files: +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:05.212 | 47.2 MB | +| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:01.658 | 58.1 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ | :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ diff --git a/docs/src/auto_examples/howtos/index.rst b/docs/src/auto_examples/howtos/index.rst new file mode 100644 index 0000000000..a43341d306 --- /dev/null +++ b/docs/src/auto_examples/howtos/index.rst @@ -0,0 +1,97 @@ + + +.. _sphx_glr_auto_examples_howtos: + +How-to Guides: Solve a Problem +------------------------------ + +These **goal-oriented guides** demonstrate how to **solve a specific problem** using gensim. + + + +.. raw:: html + +
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + :alt: How to download pre-trained models and corpora + + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + +.. raw:: html + +
How to download pre-trained models and corpora
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + :alt: How to Author Gensim Documentation + + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + +.. raw:: html + +
How to Author Gensim Documentation
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + :alt: How to reproduce the doc2vec 'Paragraph Vector' paper + + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + +.. raw:: html + +
How to reproduce the doc2vec 'Paragraph Vector' paper
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + :alt: How to Compare LDA Models + + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + +.. raw:: html + +
How to Compare LDA Models
+
+ + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /auto_examples/howtos/run_downloader_api + /auto_examples/howtos/run_doc + /auto_examples/howtos/run_doc2vec_imdb + /auto_examples/howtos/run_compare_lda + diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 47819284b2..2d2f133d0c 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -1,23 +1,23 @@ :orphan: - - -.. _sphx_glr_auto_examples: - Documentation ============= +.. _gallery_top: + We welcome contributions to our documentation via GitHub pull requests, whether it's fixing a typo or authoring an entirely new tutorial or guide. If you're thinking about contributing documentation, please see :ref:`sphx_glr_auto_examples_howtos_run_doc.py`. + .. raw:: html -
+
+.. raw:: html -.. _sphx_glr_auto_examples_core: +
Core Tutorials: New Users Start Here! ------------------------------------- @@ -27,96 +27,82 @@ Understanding this functionality is vital for using gensim effectively. +.. raw:: html + +
+ + .. raw:: html
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png - :alt: Core Concepts + .. image:: /auto_examples/core/images/thumb/sphx_glr_run_core_concepts_thumb.png + :alt: Core Concepts - :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` + :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` .. raw:: html +
Core Concepts
-.. toctree:: - :hidden: - - /auto_examples/core/run_core_concepts - .. raw:: html
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png - :alt: Corpora and Vector Spaces + .. image:: /auto_examples/core/images/thumb/sphx_glr_run_corpora_and_vector_spaces_thumb.png + :alt: Corpora and Vector Spaces - :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` + :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` .. raw:: html +
Corpora and Vector Spaces
-.. toctree:: - :hidden: - - /auto_examples/core/run_corpora_and_vector_spaces - .. raw:: html
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png - :alt: Topics and Transformations + .. image:: /auto_examples/core/images/thumb/sphx_glr_run_topics_and_transformations_thumb.png + :alt: Topics and Transformations - :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` + :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` .. raw:: html +
Topics and Transformations
-.. toctree:: - :hidden: - - /auto_examples/core/run_topics_and_transformations - .. raw:: html
.. only:: html - .. figure:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png - :alt: Similarity Queries + .. image:: /auto_examples/core/images/thumb/sphx_glr_run_similarity_queries_thumb.png + :alt: Similarity Queries - :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` + :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` .. raw:: html +
Similarity Queries
-.. toctree:: - :hidden: - - /auto_examples/core/run_similarity_queries .. raw:: html -
- - - -.. _sphx_glr_auto_examples_tutorials: +
Tutorials: Learning Oriented Lessons ------------------------------------ @@ -125,180 +111,150 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod +.. raw:: html + +
+ + .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png - :alt: Word2Vec Model + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + :alt: Word2Vec Model - :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` .. raw:: html +
Word2Vec Model
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_word2vec - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png - :alt: Doc2Vec Model + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + :alt: Doc2Vec Model - :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` .. raw:: html +
Doc2Vec Model
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_doc2vec_lee - .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html +
FastText Model
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_ensemblelda - .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html +
Ensemble LDA
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_fasttext - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png - :alt: Fast Similarity Queries with Annoy and Word2Vec + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec - :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` .. raw:: html +
Fast Similarity Queries with Annoy and Word2Vec
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_annoy - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png - :alt: LDA Model + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model - :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` .. raw:: html +
LDA Model
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_lda - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html +
Word Mover's Distance
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_wmd - .. raw:: html
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png - :alt: Soft Cosine Measure + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png + :alt: Soft Cosine Measure - :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` .. raw:: html +
Soft Cosine Measure
-.. toctree:: - :hidden: - - /auto_examples/tutorials/run_scm .. raw:: html -
- - - -.. _sphx_glr_auto_examples_howtos: +
How-to Guides: Solve a Problem ------------------------------ @@ -307,96 +263,82 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u +.. raw:: html + +
+ + .. raw:: html
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png - :alt: How to download pre-trained models and corpora + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_downloader_api_thumb.png + :alt: How to download pre-trained models and corpora - :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` + :ref:`sphx_glr_auto_examples_howtos_run_downloader_api.py` .. raw:: html +
How to download pre-trained models and corpora
-.. toctree:: - :hidden: - - /auto_examples/howtos/run_downloader_api - .. raw:: html
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png - :alt: How to Author Gensim Documentation + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc_thumb.png + :alt: How to Author Gensim Documentation - :ref:`sphx_glr_auto_examples_howtos_run_doc.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc.py` .. raw:: html +
How to Author Gensim Documentation
-.. toctree:: - :hidden: - - /auto_examples/howtos/run_doc - .. raw:: html
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png - :alt: How to reproduce the doc2vec 'Paragraph Vector' paper + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_doc2vec_imdb_thumb.png + :alt: How to reproduce the doc2vec 'Paragraph Vector' paper - :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` + :ref:`sphx_glr_auto_examples_howtos_run_doc2vec_imdb.py` .. raw:: html +
How to reproduce the doc2vec 'Paragraph Vector' paper
-.. toctree:: - :hidden: - - /auto_examples/howtos/run_doc2vec_imdb - .. raw:: html
.. only:: html - .. figure:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png - :alt: How to Compare LDA Models + .. image:: /auto_examples/howtos/images/thumb/sphx_glr_run_compare_lda_thumb.png + :alt: How to Compare LDA Models - :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` + :ref:`sphx_glr_auto_examples_howtos_run_compare_lda.py` .. raw:: html +
How to Compare LDA Models
-.. toctree:: - :hidden: - - /auto_examples/howtos/run_compare_lda .. raw:: html -
- - - -.. _sphx_glr_auto_examples_other: +
Other Resources --------------- @@ -433,27 +375,38 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from - ? `Deep Inverse Regression with Yelp Reviews `__ (Document Classification using Bayesian Inversion and several word2vec models, one for each class) + .. raw:: html -
+
+.. raw:: html -.. only :: html +
+ + +.. toctree:: + :hidden: + :includehidden: - .. container:: sphx-glr-footer - :class: sphx-glr-footer-gallery + /auto_examples/core/index.rst + /auto_examples/tutorials/index.rst + /auto_examples/howtos/index.rst + /auto_examples/other/index.rst - .. container:: sphx-glr-download sphx-glr-download-python +.. only:: html - :download:`Download all examples in Python source code: auto_examples_python.zip ` + .. container:: sphx-glr-footer sphx-glr-footer-gallery + .. container:: sphx-glr-download sphx-glr-download-python + :download:`Download all examples in Python source code: auto_examples_python.zip ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/other/index.rst b/docs/src/auto_examples/other/index.rst new file mode 100644 index 0000000000..c573b3902a --- /dev/null +++ b/docs/src/auto_examples/other/index.rst @@ -0,0 +1,49 @@ + + +.. _sphx_glr_auto_examples_other: + +Other Resources +--------------- + +Blog posts, tutorial videos, hackathons and other useful Gensim resources, from around the internet. + +- *Use FastText or Word2Vec?* Comparison of embedding quality and performance. `Jupyter Notebook `__ +- Multiword phrases extracted from *How I Met Your Mother*. `Blog post by Mark Needham `__ +- *Using Gensim LDA for hierarchical document clustering*. `Jupyter notebook by Brandon Rose `__ +- *Evolution of Voldemort topic through the 7 Harry Potter books*. `Blog post `__ +- *Movie plots by genre*: Document classification using various techniques: TF-IDF, word2vec averaging, Deep IR, Word Movers Distance and doc2vec. `Github repo `__ +- *Word2vec: Faster than Google? Optimization lessons in Python*, talk by Radim Řehůřek at PyData Berlin 2014. `Youtube video `__ +- *Word2vec & friends*, talk by Radim Řehůřek at MLMU.cz 7.1.2015. `Youtube video `__ + +.. + - ? `Making an Impact with NLP `__ -- Pycon 2016 Tutorial by Hobsons Lane + - ? `NLP with NLTK and Gensim `__ -- Pycon 2016 Tutorial by Tony Ojeda, Benjamin Bengfort, Laura Lorenz from District Data Labs + - ? `Word Embeddings for Fun and Profit `__ -- Talk at PyData London 2016 talk by Lev Konstantinovskiy. See accompanying `repo `__ + - ? English Wikipedia; TODO: convert to proper .py format + - ? `Colouring words by topic in a document, print words in a + topics `__ + - ? `Topic Coherence, a metric that correlates that human judgement on topic quality. `__ + - ? `America's Next Topic Model slides `__ + - How to choose your next topic model, presented at Pydata Berlin 10 August 2016 by Lev Konstantinovsky + - ? `Dynamic Topic Modeling and Dynamic Influence Model Tutorial `__ + - ? `Python Dynamic Topic Modelling Theory and Tutorial `__ + - ? `Word Movers Distance for Yelp Reviews tutorial `__ + - FIXME WMD superceded by soft cosine similarity = faster and better? any numbers / tutorials for that? + - ? `Great illustration of corpus preparation `__, `Code `__ + - ? `Alternative `__, + - ? `Alternative 2 `__ + - ? `Doc2Vec on customer reviews `__ + - ? `Doc2Vec on Airline Tweets Sentiment Analysis `__ + - ? `Deep Inverse Regression with Yelp Reviews `__ (Document Classification using Bayesian Inversion and several word2vec models, one for each class) + + + +.. raw:: html + +
+ + +.. raw:: html + +
+ diff --git a/docs/src/auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png b/docs/src/auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png index 9d95c4d8f0..9994f9312b 100644 Binary files a/docs/src/auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png and b/docs/src/auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png differ diff --git a/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png b/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png index 7d7ea7db56..d321d6e8ec 100644 Binary files a/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png and b/docs/src/auto_examples/tutorials/images/sphx_glr_run_wmd_001.png differ diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png index 7cd540a896..74b0c057f3 100644 Binary files a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png and b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png differ diff --git a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png index 381d4a9d59..9732c8e096 100644 Binary files a/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png and b/docs/src/auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png differ diff --git a/docs/src/auto_examples/tutorials/index.rst b/docs/src/auto_examples/tutorials/index.rst new file mode 100644 index 0000000000..d33cb95a53 --- /dev/null +++ b/docs/src/auto_examples/tutorials/index.rst @@ -0,0 +1,169 @@ + + +.. _sphx_glr_auto_examples_tutorials: + +Tutorials: Learning Oriented Lessons +------------------------------------ + +Learning-oriented lessons that introduce a particular gensim feature, e.g. a model (Word2Vec, FastText) or technique (similarity queries or text summarization). + + + +.. raw:: html + +
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_word2vec_thumb.png + :alt: Word2Vec Model + + :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` + +.. raw:: html + +
Word2Vec Model
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_doc2vec_lee_thumb.png + :alt: Doc2Vec Model + + :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` + +.. raw:: html + +
Doc2Vec Model
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model + + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + +.. raw:: html + +
FastText Model
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA + + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + +.. raw:: html + +
Ensemble LDA
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_annoy_thumb.png + :alt: Fast Similarity Queries with Annoy and Word2Vec + + :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` + +.. raw:: html + +
Fast Similarity Queries with Annoy and Word2Vec
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_lda_thumb.png + :alt: LDA Model + + :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` + +.. raw:: html + +
LDA Model
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance + + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + +.. raw:: html + +
Word Mover's Distance
+
+ + +.. raw:: html + +
+ +.. only:: html + + .. image:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png + :alt: Soft Cosine Measure + + :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` + +.. raw:: html + +
Soft Cosine Measure
+
+ + +.. raw:: html + +
+ + +.. toctree:: + :hidden: + + /auto_examples/tutorials/run_word2vec + /auto_examples/tutorials/run_doc2vec_lee + /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_annoy + /auto_examples/tutorials/run_lda + /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_scm + diff --git a/docs/src/auto_examples/tutorials/run_doc2vec_lee.ipynb b/docs/src/auto_examples/tutorials/run_doc2vec_lee.ipynb index 5314bea335..a886f2f526 100644 --- a/docs/src/auto_examples/tutorials/run_doc2vec_lee.ipynb +++ b/docs/src/auto_examples/tutorials/run_doc2vec_lee.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nDoc2Vec Model\n=============\n\nIntroduces Gensim's Doc2Vec model and demonstrates its use on the\n`Lee Corpus `__.\n\n\n" + "\n# Doc2Vec Model\n\nIntroduces Gensim's Doc2Vec model and demonstrates its use on the\n[Lee Corpus](https://hekyll.services.adelaide.edu.au/dspace/bitstream/2440/28910/1/hdl_28910.pdf)_.\n" ] }, { @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Doc2Vec is a `core_concepts_model` that represents each\n`core_concepts_document` as a `core_concepts_vector`. This\ntutorial introduces the model and demonstrates how to train and assess it.\n\nHere's a list of what we'll be doing:\n\n0. Review the relevant models: bag-of-words, Word2Vec, Doc2Vec\n1. Load and preprocess the training and test corpora (see `core_concepts_corpus`)\n2. Train a Doc2Vec `core_concepts_model` model using the training corpus\n3. Demonstrate how the trained model can be used to infer a `core_concepts_vector`\n4. Assess the model\n5. Test the model on the test corpus\n\nReview: Bag-of-words\n--------------------\n\n.. Note:: Feel free to skip these review sections if you're already familiar with the models.\n\nYou may be familiar with the `bag-of-words model\n`_ from the\n`core_concepts_vector` section.\nThis model transforms each document to a fixed-length vector of integers.\nFor example, given the sentences:\n\n- ``John likes to watch movies. Mary likes movies too.``\n- ``John also likes to watch football games. Mary hates football.``\n\nThe model outputs the vectors:\n\n- ``[1, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0]``\n- ``[1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 1]``\n\nEach vector has 10 elements, where each element counts the number of times a\nparticular word occurred in the document.\nThe order of elements is arbitrary.\nIn the example above, the order of the elements corresponds to the words:\n``[\"John\", \"likes\", \"to\", \"watch\", \"movies\", \"Mary\", \"too\", \"also\", \"football\", \"games\", \"hates\"]``.\n\nBag-of-words models are surprisingly effective, but have several weaknesses.\n\nFirst, they lose all information about word order: \"John likes Mary\" and\n\"Mary likes John\" correspond to identical vectors. There is a solution: bag\nof `n-grams `__\nmodels consider word phrases of length n to represent documents as\nfixed-length vectors to capture local word order but suffer from data\nsparsity and high dimensionality.\n\nSecond, the model does not attempt to learn the meaning of the underlying\nwords, and as a consequence, the distance between vectors doesn't always\nreflect the difference in meaning. The ``Word2Vec`` model addresses this\nsecond problem.\n\nReview: ``Word2Vec`` Model\n--------------------------\n\n``Word2Vec`` is a more recent model that embeds words in a lower-dimensional\nvector space using a shallow neural network. 
The result is a set of\nword-vectors where vectors close together in vector space have similar\nmeanings based on context, and word-vectors distant to each other have\ndiffering meanings. For example, ``strong`` and ``powerful`` would be close\ntogether and ``strong`` and ``Paris`` would be relatively far.\n\nGensim's :py:class:`~gensim.models.word2vec.Word2Vec` class implements this model.\n\nWith the ``Word2Vec`` model, we can calculate the vectors for each **word** in a document.\nBut what if we want to calculate a vector for the **entire document**\\ ?\nWe could average the vectors for each word in the document - while this is quick and crude, it can often be useful.\nHowever, there is a better way...\n\nIntroducing: Paragraph Vector\n-----------------------------\n\n.. Important:: In Gensim, we refer to the Paragraph Vector model as ``Doc2Vec``.\n\nLe and Mikolov in 2014 introduced the `Doc2Vec algorithm `__,\nwhich usually outperforms such simple-averaging of ``Word2Vec`` vectors.\n\nThe basic idea is: act as if a document has another floating word-like\nvector, which contributes to all training predictions, and is updated like\nother word-vectors, but we will call it a doc-vector. Gensim's\n:py:class:`~gensim.models.doc2vec.Doc2Vec` class implements this algorithm.\n\nThere are two implementations:\n\n1. Paragraph Vector - Distributed Memory (PV-DM)\n2. Paragraph Vector - Distributed Bag of Words (PV-DBOW)\n\n.. Important::\n Don't let the implementation details below scare you.\n They're advanced material: if it's too much, then move on to the next section.\n\nPV-DM is analogous to Word2Vec CBOW. The doc-vectors are obtained by training\na neural network on the synthetic task of predicting a center word based an\naverage of both context word-vectors and the full document's doc-vector.\n\nPV-DBOW is analogous to Word2Vec SG. The doc-vectors are obtained by training\na neural network on the synthetic task of predicting a target word just from\nthe full document's doc-vector. (It is also common to combine this with\nskip-gram testing, using both the doc-vector and nearby word-vectors to\npredict a single target word, but only one at a time.)\n\nPrepare the Training and Test Data\n----------------------------------\n\nFor this tutorial, we'll be training our model using the `Lee Background\nCorpus\n`_\nincluded in gensim. This corpus contains 314 documents selected from the\nAustralian Broadcasting Corporation\u2019s news mail service, which provides text\ne-mails of headline stories and covers a number of broad topics.\n\nAnd we'll test our model by eye using the much shorter `Lee Corpus\n`_\nwhich contains 50 documents.\n\n\n" + "Doc2Vec is a `core_concepts_model` that represents each\n`core_concepts_document` as a `core_concepts_vector`. This\ntutorial introduces the model and demonstrates how to train and assess it.\n\nHere's a list of what we'll be doing:\n\n0. Review the relevant models: bag-of-words, Word2Vec, Doc2Vec\n1. Load and preprocess the training and test corpora (see `core_concepts_corpus`)\n2. Train a Doc2Vec `core_concepts_model` model using the training corpus\n3. Demonstrate how the trained model can be used to infer a `core_concepts_vector`\n4. Assess the model\n5. Test the model on the test corpus\n\n## Review: Bag-of-words\n\n.. 
Note:: Feel free to skip these review sections if you're already familiar with the models.\n\nYou may be familiar with the [bag-of-words model](https://en.wikipedia.org/wiki/Bag-of-words_model) from the\n`core_concepts_vector` section.\nThis model transforms each document to a fixed-length vector of integers.\nFor example, given the sentences:\n\n- ``John likes to watch movies. Mary likes movies too.``\n- ``John also likes to watch football games. Mary hates football.``\n\nThe model outputs the vectors:\n\n- ``[1, 2, 1, 1, 2, 1, 1, 0, 0, 0, 0]``\n- ``[1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 1]``\n\nEach vector has 10 elements, where each element counts the number of times a\nparticular word occurred in the document.\nThe order of elements is arbitrary.\nIn the example above, the order of the elements corresponds to the words:\n``[\"John\", \"likes\", \"to\", \"watch\", \"movies\", \"Mary\", \"too\", \"also\", \"football\", \"games\", \"hates\"]``.\n\nBag-of-words models are surprisingly effective, but have several weaknesses.\n\nFirst, they lose all information about word order: \"John likes Mary\" and\n\"Mary likes John\" correspond to identical vectors. There is a solution: bag\nof [n-grams](https://en.wikipedia.org/wiki/N-gram)_\nmodels consider word phrases of length n to represent documents as\nfixed-length vectors to capture local word order but suffer from data\nsparsity and high dimensionality.\n\nSecond, the model does not attempt to learn the meaning of the underlying\nwords, and as a consequence, the distance between vectors doesn't always\nreflect the difference in meaning. The ``Word2Vec`` model addresses this\nsecond problem.\n\n## Review: ``Word2Vec`` Model\n\n``Word2Vec`` is a more recent model that embeds words in a lower-dimensional\nvector space using a shallow neural network. The result is a set of\nword-vectors where vectors close together in vector space have similar\nmeanings based on context, and word-vectors distant to each other have\ndiffering meanings. For example, ``strong`` and ``powerful`` would be close\ntogether and ``strong`` and ``Paris`` would be relatively far.\n\nGensim's :py:class:`~gensim.models.word2vec.Word2Vec` class implements this model.\n\nWith the ``Word2Vec`` model, we can calculate the vectors for each **word** in a document.\nBut what if we want to calculate a vector for the **entire document**\\ ?\nWe could average the vectors for each word in the document - while this is quick and crude, it can often be useful.\nHowever, there is a better way...\n\n## Introducing: Paragraph Vector\n\n.. Important:: In Gensim, we refer to the Paragraph Vector model as ``Doc2Vec``.\n\nLe and Mikolov in 2014 introduced the [Doc2Vec algorithm](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)_,\nwhich usually outperforms such simple-averaging of ``Word2Vec`` vectors.\n\nThe basic idea is: act as if a document has another floating word-like\nvector, which contributes to all training predictions, and is updated like\nother word-vectors, but we will call it a doc-vector. Gensim's\n:py:class:`~gensim.models.doc2vec.Doc2Vec` class implements this algorithm.\n\nThere are two implementations:\n\n1. Paragraph Vector - Distributed Memory (PV-DM)\n2. Paragraph Vector - Distributed Bag of Words (PV-DBOW)\n\n.. Important::\n Don't let the implementation details below scare you.\n They're advanced material: if it's too much, then move on to the next section.\n\nPV-DM is analogous to Word2Vec CBOW. 
The doc-vectors are obtained by training\na neural network on the synthetic task of predicting a center word based an\naverage of both context word-vectors and the full document's doc-vector.\n\nPV-DBOW is analogous to Word2Vec SG. The doc-vectors are obtained by training\na neural network on the synthetic task of predicting a target word just from\nthe full document's doc-vector. (It is also common to combine this with\nskip-gram testing, using both the doc-vector and nearby word-vectors to\npredict a single target word, but only one at a time.)\n\n## Prepare the Training and Test Data\n\nFor this tutorial, we'll be training our model using the [Lee Background\nCorpus](https://hekyll.services.adelaide.edu.au/dspace/bitstream/2440/28910/1/hdl_28910.pdf)\nincluded in gensim. This corpus contains 314 documents selected from the\nAustralian Broadcasting Corporation\u2019s news mail service, which provides text\ne-mails of headline stories and covers a number of broad topics.\n\nAnd we'll test our model by eye using the much shorter [Lee Corpus](https://hekyll.services.adelaide.edu.au/dspace/bitstream/2440/28910/1/hdl_28910.pdf)\nwhich contains 50 documents.\n\n\n" ] }, { @@ -51,7 +51,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Define a Function to Read and Preprocess Text\n---------------------------------------------\n\nBelow, we define a function to:\n\n- open the train/test file (with latin encoding)\n- read the file line-by-line\n- pre-process each line (tokenize text into individual words, remove punctuation, set to lowercase, etc)\n\nThe file we're reading is a **corpus**.\nEach line of the file is a **document**.\n\n.. Important::\n To train the model, we'll need to associate a tag/number with each document\n of the training corpus. In our case, the tag is simply the zero-based line\n number.\n\n\n" + "## Define a Function to Read and Preprocess Text\n\nBelow, we define a function to:\n\n- open the train/test file (with latin encoding)\n- read the file line-by-line\n- pre-process each line (tokenize text into individual words, remove punctuation, set to lowercase, etc)\n\nThe file we're reading is a **corpus**.\nEach line of the file is a **document**.\n\n.. Important::\n To train the model, we'll need to associate a tag/number with each document\n of the training corpus. In our case, the tag is simply the zero-based line\n number.\n\n\n" ] }, { @@ -112,7 +112,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training the Model\n------------------\n\nNow, we'll instantiate a Doc2Vec model with a vector size with 50 dimensions and\niterating over the training corpus 40 times. We set the minimum word count to\n2 in order to discard words with very few occurrences. (Without a variety of\nrepresentative examples, retaining such infrequent words can often make a\nmodel worse!) Typical iteration counts in the published `Paragraph Vector paper `__\nresults, using 10s-of-thousands to millions of docs, are 10-20. More\niterations take more time and eventually reach a point of diminishing\nreturns.\n\nHowever, this is a very very small dataset (300 documents) with shortish\ndocuments (a few hundred words). Adding training passes can sometimes help\nwith such small datasets.\n\n\n" + "## Training the Model\n\nNow, we'll instantiate a Doc2Vec model with a vector size with 50 dimensions and\niterating over the training corpus 40 times. We set the minimum word count to\n2 in order to discard words with very few occurrences. 
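As a concrete sketch of the setup just described (mirroring the tutorial's own code cell, and assuming ``train_corpus`` is the list of ``TaggedDocument`` objects produced by the ``read_corpus`` helper defined earlier in this tutorial):

.. sourcecode:: pycon

    import gensim

    # vector_size=50, min_count=2 and epochs=40, as discussed above.
    model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(train_corpus)                      # scan the corpus once to build the vocabulary
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)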
(Without a variety of\nrepresentative examples, retaining such infrequent words can often make a\nmodel worse!) Typical iteration counts in the published [Paragraph Vector paper](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)\nresults, using 10s-of-thousands to millions of docs, are 10-20. More\niterations take more time and eventually reach a point of diminishing\nreturns.\n\nHowever, this is a very very small dataset (300 documents) with shortish\ndocuments (a few hundred words). Adding training passes can sometimes help\nwith such small datasets.\n\n\n" ] }, { @@ -166,7 +166,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next, train the model on the corpus.\nIf optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds.\nIf the BLAS library is not being used, this should take no more than 2\nminutes, so use optimized Gensim with BLAS if you value your time.\n\n\n" + "Next, train the model on the corpus.\nIn the usual case, where Gensim installation found a BLAS library for optimized\nbulk vector operations, this training on this tiny 300 document, ~60k word corpus \nshould take just a few seconds. (More realistic datasets of tens-of-millions\nof words or more take proportionately longer.) If for some reason a BLAS library \nisn't available, training uses a fallback approach that takes 60x-120x longer, \nso even this tiny training will take minutes rather than seconds. (And, in that \ncase, you should also notice a warning in the logging letting you know there's \nsomething worth fixing.) So, be sure your installation uses the BLAS-optimized \nGensim if you value your time.\n\n\n" ] }, { @@ -209,7 +209,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Assessing the Model\n-------------------\n\nTo assess our new model, we'll first infer new vectors for each document of\nthe training corpus, compare the inferred vectors with the training corpus,\nand then returning the rank of the document based on self-similarity.\nBasically, we're pretending as if the training corpus is some new unseen data\nand then seeing how they compare with the trained model. The expectation is\nthat we've likely overfit our model (i.e., all of the ranks will be less than\n2) and so we should be able to find similar documents very easily.\nAdditionally, we'll keep track of the second ranks for a comparison of less\nsimilar documents.\n\n\n" + "## Assessing the Model\n\nTo assess our new model, we'll first infer new vectors for each document of\nthe training corpus, compare the inferred vectors with the training corpus,\nand then return the rank of the document based on self-similarity.\nBasically, we're pretending as if the training corpus is some new unseen data\nand then seeing how they compare with the trained model. 
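(Editorial aside, not part of the original tutorial: the self-similarity check described above boils down to a loop roughly like the sketch below, assuming the trained ``model`` and the ``train_corpus`` of ``TaggedDocument`` objects built in the earlier cells; the tutorial's actual assessment cell appears further on.)

```python
ranks = []
for doc in train_corpus:
    # Re-infer a vector for a document the model was trained on...
    inferred_vector = model.infer_vector(doc.words)
    # ...and see where that document ranks among all trained doc-vectors.
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [tag for tag, sim in sims].index(doc.tags[0])
    ranks.append(rank)  # rank 0 means the document is most similar to itself
```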
The expectation is\nthat we've likely overfit our model (i.e., all of the ranks will be less than\n2) and so we should be able to find similar documents very easily.\nAdditionally, we'll keep track of the second ranks for a comparison of less\nsimilar documents.\n\n\n" ] }, { @@ -281,7 +281,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Testing the Model\n-----------------\n\nUsing the same approach above, we'll infer the vector for a randomly chosen\ntest document, and compare the document to our model by eye.\n\n\n" + "## Testing the Model\n\nUsing the same approach above, we'll infer the vector for a randomly chosen\ntest document, and compare the document to our model by eye.\n\n\n" ] }, { @@ -299,7 +299,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Conclusion\n----------\n\nLet's review what we've seen in this tutorial:\n\n0. Review the relevant models: bag-of-words, Word2Vec, Doc2Vec\n1. Load and preprocess the training and test corpora (see `core_concepts_corpus`)\n2. Train a Doc2Vec `core_concepts_model` model using the training corpus\n3. Demonstrate how the trained model can be used to infer a `core_concepts_vector`\n4. Assess the model\n5. Test the model on the test corpus\n\nThat's it! Doc2Vec is a great way to explore relationships between documents.\n\nAdditional Resources\n--------------------\n\nIf you'd like to know more about the subject matter of this tutorial, check out the links below.\n\n* `Word2Vec Paper `_\n* `Doc2Vec Paper `_\n* `Dr. Michael D. Lee's Website `_\n* `Lee Corpus `__\n* `IMDB Doc2Vec Tutorial `_\n\n\n" + "## Conclusion\n\nLet's review what we've seen in this tutorial:\n\n0. Review the relevant models: bag-of-words, Word2Vec, Doc2Vec\n1. Load and preprocess the training and test corpora (see `core_concepts_corpus`)\n2. Train a Doc2Vec `core_concepts_model` model using the training corpus\n3. Demonstrate how the trained model can be used to infer a `core_concepts_vector`\n4. Assess the model\n5. Test the model on the test corpus\n\nThat's it! Doc2Vec is a great way to explore relationships between documents.\n\n## Additional Resources\n\nIf you'd like to know more about the subject matter of this tutorial, check out the links below.\n\n* [Word2Vec Paper](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)\n* [Doc2Vec Paper](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)\n* [Dr. Michael D. Lee's Website](http://faculty.sites.uci.edu/mdlee)\n* [Lee Corpus](http://faculty.sites.uci.edu/mdlee/similarity-data/)_\n* [IMDB Doc2Vec Tutorial](doc2vec-IMDB.ipynb)\n\n\n" ] } ], @@ -319,7 +319,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_doc2vec_lee.py b/docs/src/auto_examples/tutorials/run_doc2vec_lee.py index 7012d38f66..18f4ee7b16 100644 --- a/docs/src/auto_examples/tutorials/run_doc2vec_lee.py +++ b/docs/src/auto_examples/tutorials/run_doc2vec_lee.py @@ -215,9 +215,15 @@ def read_corpus(fname, tokens_only=False): ############################################################################### # Next, train the model on the corpus. -# If optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds. -# If the BLAS library is not being used, this should take no more than 2 -# minutes, so use optimized Gensim with BLAS if you value your time. 
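# (Editorial aside, not part of the original script: if you are unsure which of
# the two cases described in the surrounding comments applies to your
# installation, the simplest check is to time the training call yourself, for
# example by wrapping the bare ``model.train(...)`` call further below like so.)
import time

start = time.time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print(f"Training took {time.time() - start:.1f} seconds")
# (A few seconds on this tiny corpus points to the optimized path; minutes
# point to the slow fallback, which the training log will also warn about.)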
+# In the usual case, where Gensim installation found a BLAS library for optimized +# bulk vector operations, this training on this tiny 300 document, ~60k word corpus +# should take just a few seconds. (More realistic datasets of tens-of-millions +# of words or more take proportionately longer.) If for some reason a BLAS library +# isn't available, training uses a fallback approach that takes 60x-120x longer, +# so even this tiny training will take minutes rather than seconds. (And, in that +# case, you should also notice a warning in the logging letting you know there's +# something worth fixing.) So, be sure your installation uses the BLAS-optimized +# Gensim if you value your time. # model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) diff --git a/docs/src/auto_examples/tutorials/run_doc2vec_lee.py.md5 b/docs/src/auto_examples/tutorials/run_doc2vec_lee.py.md5 index f1b58e756c..5c0d021557 100644 --- a/docs/src/auto_examples/tutorials/run_doc2vec_lee.py.md5 +++ b/docs/src/auto_examples/tutorials/run_doc2vec_lee.py.md5 @@ -1 +1 @@ -7d0ee86f6eb9d1e2f55b9f295eec3060 \ No newline at end of file +581caa67e8496a210a030c2886fb8bbc \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_doc2vec_lee.rst b/docs/src/auto_examples/tutorials/run_doc2vec_lee.rst index 6e99a47a13..68a6fc7d3f 100644 --- a/docs/src/auto_examples/tutorials/run_doc2vec_lee.rst +++ b/docs/src/auto_examples/tutorials/run_doc2vec_lee.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/tutorials/run_doc2vec_lee.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code + +.. rst-class:: sphx-glr-example-title - .. _sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py: +.. _sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py: Doc2Vec Model @@ -15,7 +24,7 @@ Doc2Vec Model Introduces Gensim's Doc2Vec model and demonstrates its use on the `Lee Corpus `__. - +.. GENERATED FROM PYTHON SOURCE LINES 9-13 .. code-block:: default @@ -30,6 +39,8 @@ Introduces Gensim's Doc2Vec model and demonstrates its use on the +.. GENERATED FROM PYTHON SOURCE LINES 14-129 + Doc2Vec is a :ref:`core_concepts_model` that represents each :ref:`core_concepts_document` as a :ref:`core_concepts_vector`. This tutorial introduces the model and demonstrates how to train and assess it. @@ -146,6 +157,7 @@ And we'll test our model by eye using the much shorter `Lee Corpus which contains 50 documents. +.. GENERATED FROM PYTHON SOURCE LINES 129-137 .. code-block:: default @@ -164,6 +176,8 @@ which contains 50 documents. +.. GENERATED FROM PYTHON SOURCE LINES 138-155 + Define a Function to Read and Preprocess Text --------------------------------------------- @@ -182,6 +196,7 @@ Each line of the file is a **document**. number. +.. GENERATED FROM PYTHON SOURCE LINES 155-170 .. code-block:: default @@ -207,9 +222,12 @@ Each line of the file is a **document**. +.. GENERATED FROM PYTHON SOURCE LINES 171-173 + Let's take a look at the training corpus +.. GENERATED FROM PYTHON SOURCE LINES 173-175 .. code-block:: default @@ -221,8 +239,6 @@ Let's take a look at the training corpus .. rst-class:: sphx-glr-script-out - Out: - .. 
code-block:: none [TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', 'caused', 'the', 'fire', 'to', 'burn', 'in', 'finger', 'formation', 'have', 'now', 'eased', 'and', 'about', 'fire', 'units', 'in', 'and', 'around', 'hill', 'top', 'are', 'optimistic', 'of', 'defending', 'all', 'properties', 'as', 'more', 'than', 'blazes', 'burn', 'on', 'new', 'year', 'eve', 'in', 'new', 'south', 'wales', 'fire', 'crews', 'have', 'been', 'called', 'to', 'new', 'fire', 'at', 'gunning', 'south', 'of', 'goulburn', 'while', 'few', 'details', 'are', 'available', 'at', 'this', 'stage', 'fire', 'authorities', 'says', 'it', 'has', 'closed', 'the', 'hume', 'highway', 'in', 'both', 'directions', 'meanwhile', 'new', 'fire', 'in', 'sydney', 'west', 'is', 'no', 'longer', 'threatening', 'properties', 'in', 'the', 'cranebrook', 'area', 'rain', 'has', 'fallen', 'in', 'some', 'parts', 'of', 'the', 'illawarra', 'sydney', 'the', 'hunter', 'valley', 'and', 'the', 'north', 'coast', 'but', 'the', 'bureau', 'of', 'meteorology', 'claire', 'richards', 'says', 'the', 'rain', 'has', 'done', 'little', 'to', 'ease', 'any', 'of', 'the', 'hundred', 'fires', 'still', 'burning', 'across', 'the', 'state', 'the', 'falls', 'have', 'been', 'quite', 'isolated', 'in', 'those', 'areas', 'and', 'generally', 'the', 'falls', 'have', 'been', 'less', 'than', 'about', 'five', 'millimetres', 'she', 'said', 'in', 'some', 'places', 'really', 'not', 'significant', 'at', 'all', 'less', 'than', 'millimetre', 'so', 'there', 'hasn', 'been', 'much', 'relief', 'as', 'far', 'as', 'rain', 'is', 'concerned', 'in', 'fact', 'they', 've', 'probably', 'hampered', 'the', 'efforts', 'of', 'the', 'firefighters', 'more', 'because', 'of', 'the', 'wind', 'gusts', 'that', 'are', 'associated', 'with', 'those', 'thunderstorms'], tags=[0]), TaggedDocument(words=['indian', 'security', 'forces', 'have', 'shot', 'dead', 'eight', 'suspected', 'militants', 'in', 'night', 'long', 'encounter', 'in', 'southern', 'kashmir', 'the', 'shootout', 'took', 'place', 'at', 'dora', 'village', 'some', 'kilometers', 'south', 'of', 'the', 'kashmiri', 'summer', 'capital', 'srinagar', 'the', 'deaths', 'came', 'as', 'pakistani', 'police', 'arrested', 'more', 'than', 'two', 'dozen', 'militants', 'from', 'extremist', 'groups', 'accused', 'of', 'staging', 'an', 'attack', 'on', 'india', 'parliament', 'india', 'has', 'accused', 'pakistan', 'based', 'lashkar', 'taiba', 'and', 'jaish', 'mohammad', 'of', 'carrying', 'out', 'the', 'attack', 'on', 'december', 'at', 'the', 'behest', 'of', 'pakistani', 'military', 'intelligence', 'military', 'tensions', 'have', 'soared', 'since', 
'the', 'raid', 'with', 'both', 'sides', 'massing', 'troops', 'along', 'their', 'border', 'and', 'trading', 'tit', 'for', 'tat', 'diplomatic', 'sanctions', 'yesterday', 'pakistan', 'announced', 'it', 'had', 'arrested', 'lashkar', 'taiba', 'chief', 'hafiz', 'mohammed', 'saeed', 'police', 'in', 'karachi', 'say', 'it', 'is', 'likely', 'more', 'raids', 'will', 'be', 'launched', 'against', 'the', 'two', 'groups', 'as', 'well', 'as', 'other', 'militant', 'organisations', 'accused', 'of', 'targetting', 'india', 'military', 'tensions', 'between', 'india', 'and', 'pakistan', 'have', 'escalated', 'to', 'level', 'not', 'seen', 'since', 'their', 'war'], tags=[1])] @@ -230,9 +246,12 @@ Let's take a look at the training corpus +.. GENERATED FROM PYTHON SOURCE LINES 176-178 + And the testing corpus looks like this: +.. GENERATED FROM PYTHON SOURCE LINES 178-180 .. code-block:: default @@ -244,8 +263,6 @@ And the testing corpus looks like this: .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none [['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to', 'june', 'chief', 'executive', 'paul', 'batchelor', 'said', 'the', 'result', 'was', 'solid', 'in', 'what', 'he', 'described', 'as', 'the', 'worst', 'conditions', 'for', 'stock', 'markets', 'in', 'years', 'amp', 'half', 'year', 'profit', 'sank', 'per', 'cent', 'to', 'million', 'or', 'share', 'as', 'australia', 'largest', 'investor', 'and', 'fund', 'manager', 'failed', 'to', 'hit', 'projected', 'per', 'cent', 'earnings', 'growth', 'targets', 'and', 'was', 'battered', 'by', 'falling', 'returns', 'on', 'share', 'markets']] @@ -253,10 +270,14 @@ And the testing corpus looks like this: +.. GENERATED FROM PYTHON SOURCE LINES 181-184 + Notice that the testing corpus is just a list of lists and does not contain any tags. +.. GENERATED FROM PYTHON SOURCE LINES 186-202 + Training the Model ------------------ @@ -274,6 +295,7 @@ documents (a few hundred words). Adding training passes can sometimes help with such small datasets. +.. GENERATED FROM PYTHON SOURCE LINES 202-204 .. code-block:: default @@ -283,11 +305,20 @@ with such small datasets. +.. rst-class:: sphx-glr-script-out + + .. code-block:: none + + 2022-12-07 10:59:00,578 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-12-07T10:59:00.540082', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'created'} + +.. GENERATED FROM PYTHON SOURCE LINES 205-206 + Build a vocabulary +.. GENERATED FROM PYTHON SOURCE LINES 206-208 .. 
code-block:: default @@ -299,24 +330,24 @@ Build a vocabulary .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - 2020-09-30 21:08:55,026 : INFO : collecting all words and their counts - 2020-09-30 21:08:55,027 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags - 2020-09-30 21:08:55,043 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words - 2020-09-30 21:08:55,043 : INFO : Loading a fresh vocabulary - 2020-09-30 21:08:55,064 : INFO : effective_min_count=2 retains 3955 unique words (56% of original 6981, drops 3026) - 2020-09-30 21:08:55,064 : INFO : effective_min_count=2 leaves 55126 word corpus (94% of original 58152, drops 3026) - 2020-09-30 21:08:55,098 : INFO : deleting the raw counts dictionary of 6981 items - 2020-09-30 21:08:55,100 : INFO : sample=0.001 downsamples 46 most-common words - 2020-09-30 21:08:55,100 : INFO : downsampling leaves estimated 42390 word corpus (76.9% of prior 55126) - 2020-09-30 21:08:55,149 : INFO : estimated required memory for 3955 words and 50 dimensions: 3679500 bytes - 2020-09-30 21:08:55,149 : INFO : resetting layer weights + 2022-12-07 10:59:00,806 : INFO : collecting all words and their counts + 2022-12-07 10:59:00,808 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags + 2022-12-07 10:59:00,850 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words + 2022-12-07 10:59:00,850 : INFO : Creating a fresh vocabulary + 2022-12-07 10:59:00,887 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3955 unique words (56.65% of original 6981, drops 3026)', 'datetime': '2022-12-07T10:59:00.886953', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'} + 2022-12-07 10:59:00,887 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 55126 word corpus (94.80% of original 58152, drops 3026)', 'datetime': '2022-12-07T10:59:00.887466', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'} + 2022-12-07 10:59:00,917 : INFO : deleting the raw counts dictionary of 6981 items + 2022-12-07 10:59:00,918 : INFO : sample=0.001 downsamples 46 most-common words + 2022-12-07 10:59:00,918 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 42390.98914085061 word corpus (76.9%% of prior 55126)', 'datetime': '2022-12-07T10:59:00.918276', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'} + 2022-12-07 10:59:00,965 : INFO : estimated required memory for 3955 words and 50 dimensions: 3679500 bytes + 2022-12-07 10:59:00,965 : INFO : resetting layer weights + +.. GENERATED FROM PYTHON SOURCE LINES 209-214 Essentially, the vocabulary is a list (accessible via ``model.wv.index_to_key``) of all of the unique words extracted from the training corpus. @@ -324,6 +355,7 @@ Additional attributes for each word are available using the ``model.wv.get_vecat For example, to see how many times ``penalty`` appeared in the training corpus: +.. GENERATED FROM PYTHON SOURCE LINES 214-216 .. code-block:: default @@ -335,8 +367,6 @@ For example, to see how many times ``penalty`` appeared in the training corpus: .. 
rst-class:: sphx-glr-script-out - Out: - .. code-block:: none Word 'penalty' appeared 4 times in the training corpus. @@ -344,12 +374,21 @@ For example, to see how many times ``penalty`` appeared in the training corpus: +.. GENERATED FROM PYTHON SOURCE LINES 217-228 + Next, train the model on the corpus. -If optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds. -If the BLAS library is not being used, this should take no more than 2 -minutes, so use optimized Gensim with BLAS if you value your time. +In the usual case, where Gensim installation found a BLAS library for optimized +bulk vector operations, this training on this tiny 300 document, ~60k word corpus +should take just a few seconds. (More realistic datasets of tens-of-millions +of words or more take proportionately longer.) If for some reason a BLAS library +isn't available, training uses a fallback approach that takes 60x-120x longer, +so even this tiny training will take minutes rather than seconds. (And, in that +case, you should also notice a warning in the logging letting you know there's +something worth fixing.) So, be sure your installation uses the BLAS-optimized +Gensim if you value your time. +.. GENERATED FROM PYTHON SOURCE LINES 228-230 .. code-block:: default @@ -361,181 +400,62 @@ minutes, so use optimized Gensim with BLAS if you value your time. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - 2020-09-30 21:08:55,553 : INFO : training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 - 2020-09-30 21:08:55,613 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,614 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,614 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,614 : INFO : EPOCH - 1 : training on 58152 raw words (42784 effective words) took 0.1s, 751479 effective words/s - 2020-09-30 21:08:55,664 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,666 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,666 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,666 : INFO : EPOCH - 2 : training on 58152 raw words (42745 effective words) took 0.1s, 845101 effective words/s - 2020-09-30 21:08:55,718 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,719 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,720 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,720 : INFO : EPOCH - 3 : training on 58152 raw words (42605 effective words) took 0.1s, 810845 effective words/s - 2020-09-30 21:08:55,781 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,783 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,784 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,784 : INFO : EPOCH - 4 : training on 58152 raw words (42723 effective words) took 0.1s, 677810 effective words/s - 2020-09-30 21:08:55,846 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,847 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,848 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 
21:08:55,848 : INFO : EPOCH - 5 : training on 58152 raw words (42641 effective words) took 0.1s, 682513 effective words/s - 2020-09-30 21:08:55,903 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,905 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,905 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,905 : INFO : EPOCH - 6 : training on 58152 raw words (42654 effective words) took 0.1s, 760381 effective words/s - 2020-09-30 21:08:55,960 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:55,962 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:55,964 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:55,964 : INFO : EPOCH - 7 : training on 58152 raw words (42751 effective words) took 0.1s, 741994 effective words/s - 2020-09-30 21:08:56,018 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,020 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,020 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,020 : INFO : EPOCH - 8 : training on 58152 raw words (42692 effective words) took 0.1s, 773631 effective words/s - 2020-09-30 21:08:56,076 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,078 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,081 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,081 : INFO : EPOCH - 9 : training on 58152 raw words (42745 effective words) took 0.1s, 719453 effective words/s - 2020-09-30 21:08:56,137 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,137 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,137 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,138 : INFO : EPOCH - 10 : training on 58152 raw words (42733 effective words) took 0.1s, 770082 effective words/s - 2020-09-30 21:08:56,195 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,196 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,197 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,197 : INFO : EPOCH - 11 : training on 58152 raw words (42791 effective words) took 0.1s, 734171 effective words/s - 2020-09-30 21:08:56,253 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,255 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,255 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,255 : INFO : EPOCH - 12 : training on 58152 raw words (42773 effective words) took 0.1s, 745248 effective words/s - 2020-09-30 21:08:56,316 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,318 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,318 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,318 : INFO : EPOCH - 13 : training on 58152 raw words (42793 effective words) took 0.1s, 702300 effective words/s - 2020-09-30 21:08:56,369 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 
21:08:56,371 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,373 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,373 : INFO : EPOCH - 14 : training on 58152 raw words (42637 effective words) took 0.1s, 802259 effective words/s - 2020-09-30 21:08:56,421 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,425 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,426 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,426 : INFO : EPOCH - 15 : training on 58152 raw words (42686 effective words) took 0.1s, 820787 effective words/s - 2020-09-30 21:08:56,475 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,478 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,479 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,479 : INFO : EPOCH - 16 : training on 58152 raw words (42799 effective words) took 0.1s, 829690 effective words/s - 2020-09-30 21:08:56,530 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,530 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,533 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,534 : INFO : EPOCH - 17 : training on 58152 raw words (42733 effective words) took 0.1s, 794744 effective words/s - 2020-09-30 21:08:56,583 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,585 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,587 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,587 : INFO : EPOCH - 18 : training on 58152 raw words (42703 effective words) took 0.1s, 813146 effective words/s - 2020-09-30 21:08:56,638 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,640 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,640 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,641 : INFO : EPOCH - 19 : training on 58152 raw words (42763 effective words) took 0.1s, 822300 effective words/s - 2020-09-30 21:08:56,696 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,700 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,700 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,700 : INFO : EPOCH - 20 : training on 58152 raw words (42649 effective words) took 0.1s, 733047 effective words/s - 2020-09-30 21:08:56,752 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,753 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,754 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,754 : INFO : EPOCH - 21 : training on 58152 raw words (42701 effective words) took 0.1s, 822006 effective words/s - 2020-09-30 21:08:56,803 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,805 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,805 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,805 : INFO : EPOCH - 22 : training on 
58152 raw words (42714 effective words) took 0.1s, 848390 effective words/s - 2020-09-30 21:08:56,857 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,857 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,859 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,860 : INFO : EPOCH - 23 : training on 58152 raw words (42740 effective words) took 0.1s, 811758 effective words/s - 2020-09-30 21:08:56,907 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,909 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,910 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,910 : INFO : EPOCH - 24 : training on 58152 raw words (42754 effective words) took 0.0s, 873741 effective words/s - 2020-09-30 21:08:56,959 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:56,960 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:56,960 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:56,960 : INFO : EPOCH - 25 : training on 58152 raw words (42704 effective words) took 0.0s, 862291 effective words/s - 2020-09-30 21:08:57,009 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,010 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,011 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,011 : INFO : EPOCH - 26 : training on 58152 raw words (42741 effective words) took 0.0s, 868076 effective words/s - 2020-09-30 21:08:57,059 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,062 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,063 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,063 : INFO : EPOCH - 27 : training on 58152 raw words (42610 effective words) took 0.1s, 830699 effective words/s - 2020-09-30 21:08:57,112 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,114 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,115 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,116 : INFO : EPOCH - 28 : training on 58152 raw words (42747 effective words) took 0.1s, 835959 effective words/s - 2020-09-30 21:08:57,164 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,169 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,170 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,170 : INFO : EPOCH - 29 : training on 58152 raw words (42755 effective words) took 0.1s, 804348 effective words/s - 2020-09-30 21:08:57,219 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,222 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,224 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,224 : INFO : EPOCH - 30 : training on 58152 raw words (42760 effective words) took 0.1s, 808636 effective words/s - 2020-09-30 21:08:57,271 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,273 : INFO : worker thread finished; 
awaiting finish of 1 more threads - 2020-09-30 21:08:57,273 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,273 : INFO : EPOCH - 31 : training on 58152 raw words (42727 effective words) took 0.0s, 889118 effective words/s - 2020-09-30 21:08:57,323 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,326 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,327 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,327 : INFO : EPOCH - 32 : training on 58152 raw words (42786 effective words) took 0.1s, 819149 effective words/s - 2020-09-30 21:08:57,377 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,378 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,379 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,379 : INFO : EPOCH - 33 : training on 58152 raw words (42614 effective words) took 0.1s, 828217 effective words/s - 2020-09-30 21:08:57,427 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,430 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,431 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,431 : INFO : EPOCH - 34 : training on 58152 raw words (42757 effective words) took 0.1s, 848700 effective words/s - 2020-09-30 21:08:57,476 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,479 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,481 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,481 : INFO : EPOCH - 35 : training on 58152 raw words (42713 effective words) took 0.0s, 881912 effective words/s - 2020-09-30 21:08:57,530 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,530 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,532 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,532 : INFO : EPOCH - 36 : training on 58152 raw words (42632 effective words) took 0.1s, 843930 effective words/s - 2020-09-30 21:08:57,580 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,583 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,584 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,584 : INFO : EPOCH - 37 : training on 58152 raw words (42691 effective words) took 0.1s, 851268 effective words/s - 2020-09-30 21:08:57,632 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,634 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,635 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,635 : INFO : EPOCH - 38 : training on 58152 raw words (42667 effective words) took 0.1s, 850589 effective words/s - 2020-09-30 21:08:57,685 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,686 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,687 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,687 : INFO : EPOCH - 39 : training on 58152 raw words (42641 effective words) took 
0.1s, 843857 effective words/s - 2020-09-30 21:08:57,736 : INFO : worker thread finished; awaiting finish of 2 more threads - 2020-09-30 21:08:57,737 : INFO : worker thread finished; awaiting finish of 1 more threads - 2020-09-30 21:08:57,741 : INFO : worker thread finished; awaiting finish of 0 more threads - 2020-09-30 21:08:57,741 : INFO : EPOCH - 40 : training on 58152 raw words (42721 effective words) took 0.1s, 807691 effective words/s - 2020-09-30 21:08:57,741 : INFO : training on a 2326080 raw words (1708575 effective words) took 2.2s, 781245 effective words/s - - - + 2022-12-07 10:59:01,272 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-12-07T10:59:01.271863', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'train'} + 2022-12-07 10:59:01,408 : INFO : EPOCH 0: training on 58152 raw words (42665 effective words) took 0.1s, 335294 effective words/s + 2022-12-07 10:59:01,462 : INFO : EPOCH 1: training on 58152 raw words (42755 effective words) took 0.1s, 816420 effective words/s + 2022-12-07 10:59:01,521 : INFO : EPOCH 2: training on 58152 raw words (42692 effective words) took 0.1s, 745004 effective words/s + 2022-12-07 10:59:01,573 : INFO : EPOCH 3: training on 58152 raw words (42670 effective words) took 0.1s, 841368 effective words/s + 2022-12-07 10:59:01,627 : INFO : EPOCH 4: training on 58152 raw words (42685 effective words) took 0.1s, 815442 effective words/s + 2022-12-07 10:59:01,703 : INFO : EPOCH 5: training on 58152 raw words (42709 effective words) took 0.1s, 578402 effective words/s + 2022-12-07 10:59:01,753 : INFO : EPOCH 6: training on 58152 raw words (42594 effective words) took 0.0s, 864899 effective words/s + 2022-12-07 10:59:01,804 : INFO : EPOCH 7: training on 58152 raw words (42721 effective words) took 0.0s, 864073 effective words/s + 2022-12-07 10:59:01,881 : INFO : EPOCH 8: training on 58152 raw words (42622 effective words) took 0.1s, 566867 effective words/s + 2022-12-07 10:59:01,932 : INFO : EPOCH 9: training on 58152 raw words (42770 effective words) took 0.0s, 862066 effective words/s + 2022-12-07 10:59:02,006 : INFO : EPOCH 10: training on 58152 raw words (42739 effective words) took 0.1s, 587035 effective words/s + 2022-12-07 10:59:02,058 : INFO : EPOCH 11: training on 58152 raw words (42612 effective words) took 0.1s, 850879 effective words/s + 2022-12-07 10:59:02,135 : INFO : EPOCH 12: training on 58152 raw words (42655 effective words) took 0.1s, 566216 effective words/s + 2022-12-07 10:59:02,187 : INFO : EPOCH 13: training on 58152 raw words (42749 effective words) took 0.1s, 844125 effective words/s + 2022-12-07 10:59:02,265 : INFO : EPOCH 14: training on 58152 raw words (42748 effective words) took 0.1s, 556136 effective words/s + 2022-12-07 10:59:02,347 : INFO : EPOCH 15: training on 58152 raw words (42748 effective words) took 0.1s, 530528 effective words/s + 2022-12-07 10:59:02,398 : INFO : EPOCH 16: training on 58152 raw words (42737 effective words) took 0.0s, 871200 effective words/s + 2022-12-07 10:59:02,485 : INFO : EPOCH 17: training on 58152 raw words (42697 effective words) took 0.1s, 499981 effective words/s + 2022-12-07 10:59:02,584 : INFO : EPOCH 18: training on 58152 raw words (42747 effective words) took 0.1s, 440730 effective words/s + 2022-12-07 10:59:02,672 : INFO : 
EPOCH 19: training on 58152 raw words (42739 effective words) took 0.1s, 497651 effective words/s + 2022-12-07 10:59:02,761 : INFO : EPOCH 20: training on 58152 raw words (42782 effective words) took 0.1s, 499103 effective words/s + 2022-12-07 10:59:02,851 : INFO : EPOCH 21: training on 58152 raw words (42580 effective words) took 0.1s, 489515 effective words/s + 2022-12-07 10:59:02,939 : INFO : EPOCH 22: training on 58152 raw words (42687 effective words) took 0.1s, 496560 effective words/s + 2022-12-07 10:59:03,023 : INFO : EPOCH 23: training on 58152 raw words (42667 effective words) took 0.1s, 517527 effective words/s + 2022-12-07 10:59:03,156 : INFO : EPOCH 24: training on 58152 raw words (42678 effective words) took 0.1s, 328575 effective words/s + 2022-12-07 10:59:03,322 : INFO : EPOCH 25: training on 58152 raw words (42743 effective words) took 0.2s, 261440 effective words/s + 2022-12-07 10:59:03,486 : INFO : EPOCH 26: training on 58152 raw words (42692 effective words) took 0.2s, 266564 effective words/s + 2022-12-07 10:59:03,627 : INFO : EPOCH 27: training on 58152 raw words (42774 effective words) took 0.1s, 310530 effective words/s + 2022-12-07 10:59:03,770 : INFO : EPOCH 28: training on 58152 raw words (42706 effective words) took 0.1s, 305665 effective words/s + 2022-12-07 10:59:03,901 : INFO : EPOCH 29: training on 58152 raw words (42658 effective words) took 0.1s, 334228 effective words/s + 2022-12-07 10:59:04,028 : INFO : EPOCH 30: training on 58152 raw words (42746 effective words) took 0.1s, 344379 effective words/s + 2022-12-07 10:59:04,159 : INFO : EPOCH 31: training on 58152 raw words (42676 effective words) took 0.1s, 334291 effective words/s + 2022-12-07 10:59:04,295 : INFO : EPOCH 32: training on 58152 raw words (42763 effective words) took 0.1s, 322886 effective words/s + 2022-12-07 10:59:04,488 : INFO : EPOCH 33: training on 58152 raw words (42647 effective words) took 0.2s, 224522 effective words/s + 2022-12-07 10:59:04,629 : INFO : EPOCH 34: training on 58152 raw words (42720 effective words) took 0.1s, 310616 effective words/s + 2022-12-07 10:59:04,764 : INFO : EPOCH 35: training on 58152 raw words (42775 effective words) took 0.1s, 323299 effective words/s + 2022-12-07 10:59:04,899 : INFO : EPOCH 36: training on 58152 raw words (42662 effective words) took 0.1s, 322458 effective words/s + 2022-12-07 10:59:05,032 : INFO : EPOCH 37: training on 58152 raw words (42656 effective words) took 0.1s, 329126 effective words/s + 2022-12-07 10:59:05,162 : INFO : EPOCH 38: training on 58152 raw words (42720 effective words) took 0.1s, 337238 effective words/s + 2022-12-07 10:59:05,308 : INFO : EPOCH 39: training on 58152 raw words (42688 effective words) took 0.1s, 299620 effective words/s + 2022-12-07 10:59:05,308 : INFO : Doc2Vec lifecycle event {'msg': 'training on 2326080 raw words (1708074 effective words) took 4.0s, 423332 effective words/s', 'datetime': '2022-12-07T10:59:05.308684', 'gensim': '4.2.1.dev0', 'python': '3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-135-generic-x86_64-with-glibc2.29', 'event': 'train'} + + + + +.. GENERATED FROM PYTHON SOURCE LINES 231-235 Now, we can use the trained model to infer a vector for any piece of text by passing a list of words to the ``model.infer_vector`` function. This vector can then be compared with other vectors via cosine similarity. +.. GENERATED FROM PYTHON SOURCE LINES 235-238 .. 
code-block:: default @@ -548,22 +468,22 @@ vector can then be compared with other vectors via cosine similarity. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - [-0.08478509 0.05011684 0.0675064 -0.19926868 -0.1235586 0.01768214 - -0.12645927 0.01062329 0.06113973 0.35424358 0.01320948 0.07561274 - -0.01645093 0.0692549 0.08346193 -0.01599065 0.08287009 -0.0139379 - -0.17772709 -0.26271465 0.0442089 -0.04659882 -0.12873884 0.28799203 - -0.13040264 0.12478471 -0.14091878 -0.09698066 -0.07903259 -0.10124907 - -0.28239366 0.13270256 0.04445919 -0.24210942 -0.1907376 -0.07264525 - -0.14167067 -0.22816683 -0.00663796 0.23165748 -0.10436232 -0.01028251 - -0.04064698 0.08813146 0.01072008 -0.149789 0.05923386 0.16301566 - 0.05815683 0.1258063 ] + [-0.10196274 -0.36020595 -0.10973375 0.28432116 -0.00792601 0.01950991 + 0.01309869 0.1045896 -0.2011485 -0.12135196 0.15298457 0.05421316 + -0.06486023 -0.00131951 -0.2237759 -0.08489189 0.05889525 0.27961093 + 0.08121023 -0.06200862 -0.00651888 -0.06831821 0.13001564 0.04539844 + -0.01659351 -0.02359444 -0.22276032 0.06692155 -0.11293832 -0.08056813 + 0.38737044 0.05470002 0.19902836 0.19122775 0.17020799 0.10668964 + 0.01216549 -0.3049222 -0.05198798 0.00130251 0.04994885 -0.0069596 + -0.06367141 -0.11740001 0.14623125 0.10109582 -0.06466878 -0.06512908 + 0.17817481 -0.00934212] + +.. GENERATED FROM PYTHON SOURCE LINES 239-247 Note that ``infer_vector()`` does *not* take a string, but rather a list of string tokens, which should have already been tokenized the same way as the @@ -574,6 +494,8 @@ iterative approximation problem that makes use of internal randomization, repeated inferences of the same text will return slightly different vectors. +.. GENERATED FROM PYTHON SOURCE LINES 249-262 + Assessing the Model ------------------- @@ -588,6 +510,7 @@ Additionally, we'll keep track of the second ranks for a comparison of less similar documents. +.. GENERATED FROM PYTHON SOURCE LINES 262-272 .. code-block:: default @@ -608,10 +531,13 @@ similar documents. +.. GENERATED FROM PYTHON SOURCE LINES 273-276 + Let's count how each document ranks with respect to the training corpus NB. Results vary between runs due to random seeding and very small corpus +.. GENERATED FROM PYTHON SOURCE LINES 276-281 .. code-block:: default @@ -626,8 +552,6 @@ NB. Results vary between runs due to random seeding and very small corpus .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none Counter({0: 292, 1: 8}) @@ -635,6 +559,8 @@ NB. Results vary between runs due to random seeding and very small corpus +.. GENERATED FROM PYTHON SOURCE LINES 282-290 + Basically, greater than 95% of the inferred documents are found to be most similar to itself and about 5% of the time it is mistakenly most similar to another document. Checking the inferred-vector against a @@ -644,6 +570,7 @@ behaving in a usefully consistent manner, though not a real 'accuracy' value. This is great and not entirely surprising. We can take a look at an example: +.. GENERATED FROM PYTHON SOURCE LINES 290-295 .. code-block:: default @@ -658,26 +585,26 @@ This is great and not entirely surprising. We can take a look at an example: .. rst-class:: sphx-glr-script-out - Out: - .. 
code-block:: none Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not very pretty tennis there isn too many consistent bounces you are playing like said bit of classic old grass court rafter said rafter levelled the score after lleyton hewitt shock five set loss to nicholas escude in the first singles rubber but rafter says he felt no added pressure after hewitt defeat knew had good team to back me up even if we were down he said knew could win on the last day know the boys can win doubles so even if we were down still feel we are good enough team to win and vice versa they are good enough team to beat us as well» - SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3): + SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec: - MOST (299, 0.9482713341712952): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not very pretty tennis there isn too many consistent bounces you are playing like said bit of classic old grass court rafter said rafter levelled the score after lleyton hewitt shock five set loss to nicholas escude in the first singles rubber but rafter says he felt no added pressure after hewitt defeat knew had good team to back me up even if we were down he said knew could win on the last day know the boys can win doubles so even if we were down still feel we are good enough team to win and vice versa they are good enough team to beat us as well» + MOST (299, 0.9564058780670166): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be 
made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not very pretty tennis there isn too many consistent bounces you are playing like said bit of classic old grass court rafter said rafter levelled the score after lleyton hewitt shock five set loss to nicholas escude in the first singles rubber but rafter says he felt no added pressure after hewitt defeat knew had good team to back me up even if we were down he said knew could win on the last day know the boys can win doubles so even if we were down still feel we are good enough team to win and vice versa they are good enough team to beat us as well» - SECOND-MOST (104, 0.8029672503471375): «australian cricket captain steve waugh has supported fast bowler brett lee after criticism of his intimidatory bowling to the south african tailenders in the first test in adelaide earlier this month lee was fined for giving new zealand tailender shane bond an unsportsmanlike send off during the third test in perth waugh says tailenders should not be protected from short pitched bowling these days you re earning big money you ve got responsibility to learn how to bat he said mean there no times like years ago when it was not professional and sort of bowlers code these days you re professional our batsmen work very hard at their batting and expect other tailenders to do likewise meanwhile waugh says his side will need to guard against complacency after convincingly winning the first test by runs waugh says despite the dominance of his side in the first test south africa can never be taken lightly it only one test match out of three or six whichever way you want to look at it so there lot of work to go he said but it nice to win the first battle definitely it gives us lot of confidence going into melbourne you know the big crowd there we love playing in front of the boxing day crowd so that will be to our advantage as well south africa begins four day match against new south wales in sydney on thursday in the lead up to the boxing day test veteran fast bowler allan donald will play in the warm up match and is likely to take his place in the team for the second test south african captain shaun pollock expects much better performance from his side in the melbourne test we still believe that we didn play to our full potential so if we can improve on our aspects the output we put out on the field will be lot better and we still believe we have side that is good enough to beat australia on our day he said» + SECOND-MOST (104, 0.7868924140930176): «australian cricket captain steve waugh has supported fast bowler brett lee after criticism of his intimidatory bowling to the south african tailenders in the first test in adelaide earlier this month lee was fined for giving new zealand tailender shane bond an unsportsmanlike send off during the third test in perth waugh says tailenders should not be protected 
from short pitched bowling these days you re earning big money you ve got responsibility to learn how to bat he said mean there no times like years ago when it was not professional and sort of bowlers code these days you re professional our batsmen work very hard at their batting and expect other tailenders to do likewise meanwhile waugh says his side will need to guard against complacency after convincingly winning the first test by runs waugh says despite the dominance of his side in the first test south africa can never be taken lightly it only one test match out of three or six whichever way you want to look at it so there lot of work to go he said but it nice to win the first battle definitely it gives us lot of confidence going into melbourne you know the big crowd there we love playing in front of the boxing day crowd so that will be to our advantage as well south africa begins four day match against new south wales in sydney on thursday in the lead up to the boxing day test veteran fast bowler allan donald will play in the warm up match and is likely to take his place in the team for the second test south african captain shaun pollock expects much better performance from his side in the melbourne test we still believe that we didn play to our full potential so if we can improve on our aspects the output we put out on the field will be lot better and we still believe we have side that is good enough to beat australia on our day he said» - MEDIAN (238, 0.2635717988014221): «centrelink is urging people affected by job cuts at regional pay tv operator austar and travel company traveland to seek information about their income support options traveland has announced it is shedding more than jobs around australia and austar is letting employees go centrelink finance information officer peter murray says those facing uncertain futures should head to centrelink in the next few days centrelink is the shopfront now for commonwealth services for income support and the employment network so that it is important if people haven been to us before they might get pleasant surprise at the range of services that we do offer to try and help them through situations where things might have changed for them mr murray said» + MEDIAN (119, 0.24808582663536072): «australia is continuing to negotiate with the united states government in an effort to interview the australian david hicks who was captured fighting alongside taliban forces in afghanistan mr hicks is being held by the united states on board ship in the afghanistan region where the australian federal police and australian security intelligence organisation asio officials are trying to gain access foreign affairs minister alexander downer has also confirmed that the australian government is investigating reports that another australian has been fighting for taliban forces in afghanistan we often get reports of people going to different parts of the world and asking us to investigate them he said we always investigate sometimes it is impossible to find out we just don know in this case but it is not to say that we think there are lot of australians in afghanistan the only case we know is hicks mr downer says it is unclear when mr hicks will be back on australian soil but he is hopeful the americans will facilitate australian authorities interviewing him» - LEAST (243, -0.13247375190258026): «four afghan factions have reached agreement on an interim cabinet during talks in germany the united nations says the administration which will take over from 
december will be headed by the royalist anti taliban commander hamed karzai it concludes more than week of negotiations outside bonn and is aimed at restoring peace and stability to the war ravaged country the year old former deputy foreign minister who is currently battling the taliban around the southern city of kandahar is an ally of the exiled afghan king mohammed zahir shah he will serve as chairman of an interim authority that will govern afghanistan for six month period before loya jirga or grand traditional assembly of elders in turn appoints an month transitional government meanwhile united states marines are now reported to have been deployed in eastern afghanistan where opposition forces are closing in on al qaeda soldiers reports from the area say there has been gun battle between the opposition and al qaeda close to the tora bora cave complex where osama bin laden is thought to be hiding in the south of the country american marines are taking part in patrols around the air base they have secured near kandahar but are unlikely to take part in any assault on the city however the chairman of the joint chiefs of staff general richard myers says they are prepared for anything they are prepared for engagements they re robust fighting force and they re absolutely ready to engage if that required he said» + LEAST (216, -0.11085141450166702): «senior taliban official confirmed the islamic militia would begin handing over its last bastion of kandahar to pashtun tribal leaders on friday this agreement was that taliban should surrender kandahar peacefully to the elders of these areas and we should guarantee the lives and the safety of taliban authorities and all the taliban from tomorrow should start this program former taliban ambassador to pakistan abdul salam zaeef told cnn in telephone interview he insisted that the taliban would not surrender to hamid karzai the new afghan interim leader and pashtun elder who has been cooperating with the united states to calm unrest among the southern tribes the taliban will surrender to elders not to karzai karzai and other persons which they want to enter kandahar by the support of america they don allow to enter kandahar city he said the taliban will surrender the weapons the ammunition to elders» +.. GENERATED FROM PYTHON SOURCE LINES 296-305 + Notice above that the most similar document (usually the same text) has a similarity score approaching 1.0. However, the similarity score for the second-ranked documents should be significantly lower (assuming the documents @@ -688,6 +615,7 @@ We can run the next cell repeatedly to see a sampling of other target-document comparisons. +.. GENERATED FROM PYTHON SOURCE LINES 305-315 .. code-block:: default @@ -707,17 +635,17 @@ comparisons. .. rst-class:: sphx-glr-script-out - Out: - ..
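The claim above can be quantified with a quick self-similarity check: re-infer a vector for every training document and see where that document lands in its own similarity ranking. The following is a hedged sketch rather than the tutorial's exact assessment cell; it assumes ``model`` is the trained ``Doc2Vec`` and ``train_corpus`` is the list of ``TaggedDocument`` objects, tagged with their integer positions, built earlier in the tutorial.

.. code-block:: python

    import collections

    # For each training document, infer a fresh vector and find where that document
    # ranks among its own most-similar results (rank 0 means "most similar to itself").
    ranks = []
    for doc_id in range(len(train_corpus)):
        inferred_vector = model.infer_vector(train_corpus[doc_id].words)
        sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
        ranks.append([docid for docid, _ in sims].index(doc_id))

    # Most documents should land at rank 0 with a similarity close to 1.0,
    # while the second-ranked document scores noticeably lower.
    print(collections.Counter(ranks))

If the counter is dominated by rank 0, the model is behaving consistently with the output shown above.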
code-block:: none - Train Document (292): «rival afghan factions are deadlocked over the shape of future government the northern alliance has demanded day adjournment of power sharing talks in germany after its president burhanuddin rabbani objected to the appointment system for an interim administration president rabbani has objected to the plans for an interim government to be drawn up by appointment as discussed in bonn saying the interim leaders should be voted in by afghans themselves he also says there is no real need for sizeable international security force president rabbani says he would prefer local afghan factions drew up their own internal security forces of around personnel but if the world insisted there should be an international security presence there should be no more than or personnel in their security forces he says president rabbani objections are likely to cast doubt on his delegation ability to commit the northern alliance to any course of action decided upon in bonn he now threatens to undermine the very process he claims to support in the quest for stable government in afghanistan» + Train Document (198): «authorities are trying to track down the crew of vessel that landed undetected at cocos islands carrying asylum seekers the group of sri lankan men was found aboard their boat moored to the south of the islands yesterday afternoon shire president ron grant says investigations are underway as to the whereabouts of the crew after the asylum seekers told authorities they had left in another boat after dropping them off unfortunately for them there two aircraft the royal australian air force here at the moment and one getting prepared to fly off and obviously they will be looking to see if there is another boat he said mr grant says the sri lankans have not yet been brought ashore» + + Similar Document (89, 0.7137947082519531): «after the torching of more than buildings over the past three days the situation at the woomera detention centre overnight appeared relatively calm there was however tension inside the south australian facility with up to detainees breaking into prohibited zone the group became problem for staff after breaching fence within the centre at one point staff considered using water cannon to control the detainees it is not known if they actually resorted to any tough action but group of men wearing riot gear possibly star force police officers brought in on standby could be seen in one of the compounds late yesterday government authorities confirmed that two detainees had committed acts of self harm one of them needed stitches and is believed to have been taken away in an ambulance no other details have been released» - Similar Document (13, 0.7867921590805054): «talks between afghan and british officials in kabul have ended without final agreement on the deployment of international security force the lack of suitable translation of the document meant further delay authorities in kabul have been giving conflicting signals for weeks now over the number of peacekeepers they would allow and the role the international force would play the foreign minister dr abdullah appeared to be ending the confusion saying an agreement was about to be signed there is already the agreement so it was finalised he said but spokesman for the interior minister yunis kanooni emerged soon after to say there was no agreement and nothing to sign scores of british peacekeepers are already patrolling the streets of kabul in tandem with afghan police but proposals to enlarge 
the force to as many as international peacekeepers have been criticised by some commanders as tantamount to foreign occupation» +.. GENERATED FROM PYTHON SOURCE LINES 316-322 Testing the Model ----------------- @@ -726,6 +654,7 @@ Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye. +.. GENERATED FROM PYTHON SOURCE LINES 322-334 .. code-block:: default @@ -747,23 +676,23 @@ test document, and compare the document to our model by eye. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - Test Document (49): «labor needed to distinguish itself from the government on the issue of asylum seekers greens leader bob brown has said his senate colleague kerry nettle intends to move motion today on the first anniversary of the tampa crisis condemning the government over its refugee policy and calling for an end to mandatory detention we greens want to bring the government to book over its serial breach of international obligations as far as asylum seekers in this country are concerned senator brown said today» + Test Document (17): «the united nations world food program estimates that up to million people in seven countries malawi mozambique zambia angola swaziland lesotho and zimbabwe face death by starvation unless there is massive international response in malawi as many as people may have already died the signs of malnutrition swollen stomachs stick thin arms light coloured hair are everywhere» + + SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec: - SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3): + MOST (86, 0.8239533305168152): «argentina economy minister domingo cavallo is reported to have resigned in the face of mounting unrest over the country crumbling economy the reports in number of local media outlets could not be officially confirmed the news comes as police used teargas to disperse tens of thousands of people who had massed near the presidential palace in buenos aires and in other parts of the city to protest against the declaration of state of emergency it was declared after mounting popular discontent and widespread looting in the past few days with people over the state of the economy which has been in recession for four years» - MOST (218, 0.8016394376754761): «refugee support groups are strongly critical of federal government claims that the pacific solution program is working well the immigration minister philip ruddock says he is pleased with the program which uses pacific island nations to process asylum seekers wanting to come to australia president of the hazara ethnic society of australia hassan ghulam says the australian government is bullying smaller nations into accepting asylum seekers if the pacific countries wanted refugees they can clearly raise their voice in the united nations and say yes we are accepting refugees and why australia who gives this authority to the australian government to force the pacific countries to accept refugees in this form or in the other form he asked» + MEDIAN (221, 0.40627941489219666): «reserve bank governor ian macfarlane says he is confident australia will ride through the current world economic slump largely brought on by the united states mr macfarlane told gathering in sydney last night australia growth is remarkably good by world standards and inflation should come down in the next months he predicts the united states economy will show signs of recovery from mid year and that as result it is highly unlikely that the 
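The test-document comparison whose output appears in this block is produced by a cell along the following lines. This is a sketch under stated assumptions, not necessarily the tutorial source verbatim: ``test_corpus`` is assumed to hold plain token lists for the held-out documents, and ``model`` and ``train_corpus`` are the objects built earlier.

.. code-block:: python

    import random

    # Pick a held-out document at random and infer a vector for it.
    doc_id = random.randint(0, len(test_corpus) - 1)
    inferred_vector = model.infer_vector(test_corpus[doc_id])
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # Compare the test document against the most, median and least similar training documents.
    print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
    print('SIMILAR/DISSIMILAR DOCS PER MODEL %s:' % model)
    for label, index in [('MOST', 0), ('MEDIAN', len(sims) // 2), ('LEAST', len(sims) - 1)]:
        print('%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))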
reserve bank will raise interest rates in the next six months calendar year has been difficult one for the world economy and the first half of looks like remaining weak before recovery gets underway therefore this period will be classified as world recession like those of the mid the early and the early mr macfarlane said the australian economy has got through the first half of it in reasonably good shape» - MEDIAN (204, 0.3319269120693207): «an iraqi doctor being held at sydney villawood detention centre claims he was prevented from receiving human rights award dr aamer sultan had been awarded special commendation at yesterday human rights and equal opportunity commission awards in sydney but was not able to receive the honour in person dr sultan says he had been hoping to attend the ceremony but says the management at villawood stopped him from going submitted formal request to the centre manager who promised me that he will present the matter to migration management here who are the main authority here they also came back that unfortunately we can not fulfill this request for you but they didn give any explanation dr sultan says he was disappointed by the decision the immigration minister philip ruddock has written letter of complaint to the medical journal of australia about an article penned by dr sultan on the psychological state of detainees at villawood the journal has published research dr sultan conducted with former visiting psychologist to the centre kevin sullivan their survey of detainees over nine months found all but one displayed symptoms of psychological distress at some time the article says per cent acknowledged chronic depressive symptoms and close to half of the group had reached severe stages of depression» + LEAST (37, -0.06813289225101471): «australia quicks and opening batsmen have put the side in dominant position going into day three of the boxing day test match against south africa at the mcg australia is no wicket for only runs shy of south africa after andy bichel earlier starred as the tourists fell for when play was abandoned due to rain few overs short of scheduled stumps yesterday justin langer was not out and matthew hayden the openers went on the attack from the start with langer innings including six fours and hayden eight earlier shaun pollock and nantie haywood launched vital rearguard action to help south africa to respectable first innings total the pair put on runs for the final wicket to help the tourists to the south africans had slumped to for through combination of australia good bowling good fielding and good luck after resuming at for yesterday morning the tourists looked to be cruising as jacques kallis and neil mckenzie added without loss but then bichel suddenly had them reeling after snatching two wickets in two balls first he had jacques kallis caught behind for although kallis could consider himself very unlucky as replays showed his bat was long way from the ball on the next ball bichel snatched sharp return catch to dismiss lance klusener first ball and have shot at hat trick bichel missed out on the hat trick and mark boucher and neil mckenzie again steadied the south african innings adding before the introduction of part timer mark waugh to the attack paid off for australia waugh removed boucher for caught by bichel brett lee then chipped in trapping mckenzie leg before for with perfect inswinger bichel continued his good day in the field running out claude henderson for with direct hit from the in field lee roared in to allan 
donald bouncing him and then catching the edge with rising delivery which ricky ponting happily swallowed at third slip to remove the returning paceman for duck bichel did not get his hat trick but ended with the best figures of the australian bowlers after also picking up the final wicket of nantie haywood for lee took for and glenn mcgrath for» - LEAST (157, -0.10524928569793701): «british man has been found guilty by unanimous verdict of the kidnap and murder of an eight year old schoolgirl whose death in july shocked britain and set off rampage of anti paedophile vigilantes roy whiting was sentenced to life imprisonment for the abduction and murder of eight year old sarah payne with recommendation by trial judge justice richard curtis that he never be released you are indeed an evil man you are in no way mentally unwell have seen you for month and in my view you are glib and cunning liar justice curtis said there were cheers of delight as the verdicts were read out by the foreman at lewes crown court the jury of nine men and three women had been deliberating for nine hours as soon as the verdicts were declared the court heard details of whiting previous conviction for the kidnap and indecent assault of nine year old girl in prosecutor timothy langdale told the jury how the defendant threw the child into the back of his dirty red ford sierra and locked the doors he had driven her somewhere she didn know where when she asked where they were going he said shut up because he had knife mr langdale said the defendant told the girl to take off her clothes when she refused he produced rope from his pocket and threatened to tie her up what he actually threatened was that he would tie her mouth up she took her clothes off as he had ordered her to do mr langdale then gave graphic details of the abuse to which whiting subjected the terrified child whiting was given four year jail sentence in june after admitting carrying out the attack in march that year but he was released in november despite warnings from probation officers who were convinced there was danger he would attack another child they set out their warnings in pre sentence report prepared after the first assault and in the parole report before he was released from prison he was kept under supervision for four months after his release but was not being monitored by july last year when eight year old sarah was abducted and killed whiting has been arrested three times in connection with the case but the first and second times was released without being charged sarah disappeared on july last year prompting massive police search her partially buried naked body was found days later in field and police believe she was strangled or suffocated» +.. GENERATED FROM PYTHON SOURCE LINES 335-360 Conclusion ---------- @@ -794,30 +723,25 @@ If you'd like to know more about the subject matter of this tutorial, check out .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 7.863 seconds) + **Total running time of the script:** ( 0 minutes 16.509 seconds) -**Estimated memory usage:** 37 MB +**Estimated memory usage:** 48 MB .. _sphx_glr_download_auto_examples_tutorials_run_doc2vec_lee.py: +.. only:: html -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-footer sphx-glr-footer-example - :download:`Download Python source code: run_doc2vec_lee.py ` + .. 
container:: sphx-glr-download sphx-glr-download-python + :download:`Download Python source code: run_doc2vec_lee.py ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download Jupyter notebook: run_doc2vec_lee.ipynb ` + :download:`Download Jupyter notebook: run_doc2vec_lee.ipynb ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/run_fasttext.ipynb b/docs/src/auto_examples/tutorials/run_fasttext.ipynb index 6506e1465b..0170d7fc78 100644 --- a/docs/src/auto_examples/tutorials/run_fasttext.ipynb +++ b/docs/src/auto_examples/tutorials/run_fasttext.ipynb @@ -15,7 +15,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nFastText Model\n==============\n\nIntroduces Gensim's fastText model and demonstrates its use on the Lee Corpus.\n\n" + "\n# FastText Model\n\nIntroduces Gensim's fastText model and demonstrates its use on the Lee Corpus.\n" ] }, { @@ -40,14 +40,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When to use fastText?\n---------------------\n\nThe main principle behind `fastText `_ is that the\nmorphological structure of a word carries important information about the meaning of the word.\nSuch structure is not taken into account by traditional word embeddings like Word2Vec, which\ntrain a unique word embedding for every individual word.\nThis is especially significant for morphologically rich languages (German, Turkish) in which a\nsingle word can have a large number of morphological forms, each of which might occur rarely,\nthus making it hard to train good word embeddings.\n\n\nfastText attempts to solve this by treating each word as the aggregation of its subwords.\nFor the sake of simplicity and language-independence, subwords are taken to be the character ngrams\nof the word. The vector for a word is simply taken to be the sum of all vectors of its component char-ngrams.\n\n\nAccording to a detailed comparison of Word2Vec and fastText in\n`this notebook `__,\nfastText does significantly better on syntactic tasks as compared to the original Word2Vec,\nespecially when the size of the training corpus is small. Word2Vec slightly outperforms fastText\non semantic tasks though. The differences grow smaller as the size of the training corpus increases.\n\n\nfastText can obtain vectors even for out-of-vocabulary (OOV) words, by summing up vectors for its\ncomponent char-ngrams, provided at least one of the char-ngrams was present in the training data.\n\n\n" + "## When to use fastText?\n\nThe main principle behind [fastText](https://github.com/facebookresearch/fastText) is that the\nmorphological structure of a word carries important information about the meaning of the word.\nSuch structure is not taken into account by traditional word embeddings like Word2Vec, which\ntrain a unique word embedding for every individual word.\nThis is especially significant for morphologically rich languages (German, Turkish) in which a\nsingle word can have a large number of morphological forms, each of which might occur rarely,\nthus making it hard to train good word embeddings.\n\n\nfastText attempts to solve this by treating each word as the aggregation of its subwords.\nFor the sake of simplicity and language-independence, subwords are taken to be the character ngrams\nof the word. 
The vector for a word is simply taken to be the sum of all vectors of its component char-ngrams.\n\n\nAccording to a detailed comparison of Word2Vec and fastText in\n[this notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Word2Vec_FastText_Comparison.ipynb)_,\nfastText does significantly better on syntactic tasks as compared to the original Word2Vec,\nespecially when the size of the training corpus is small. Word2Vec slightly outperforms fastText\non semantic tasks though. The differences grow smaller as the size of the training corpus increases.\n\n\nfastText can obtain vectors even for out-of-vocabulary (OOV) words, by summing up vectors for its\ncomponent char-ngrams, provided at least one of the char-ngrams was present in the training data.\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Training models\n---------------\n\n\n" + "## Training models\n\n\n" ] }, { @@ -72,14 +72,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training hyperparameters\n^^^^^^^^^^^^^^^^^^^^^^^^\n\n\n" + "### Training hyperparameters\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec:\n\n- model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)\n- vector_size: Dimensionality of vector embeddings to be learnt (Default 100)\n- alpha: Initial learning rate (Default 0.025)\n- window: Context window size (Default 5)\n- min_count: Ignore words with number of occurrences below this (Default 5)\n- loss: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)\n- sample: Threshold for downsampling higher-frequency words (Default 0.001)\n- negative: Number of negative words to sample, for `ns` (Default 5)\n- epochs: Number of epochs (Default 5)\n- sorted_vocab: Sort vocab by descending frequency (Default 1)\n- threads: Number of threads to use (Default 12)\n\n\nIn addition, fastText has three additional parameters:\n\n- min_n: min length of char ngrams (Default 3)\n- max_n: max length of char ngrams (Default 6)\n- bucket: number of buckets used for hashing ngrams (Default 2000000)\n\n\nParameters ``min_n`` and ``max_n`` control the lengths of character ngrams that each word is broken down into while training and looking up embeddings. If ``max_n`` is set to 0, or to be lesser than ``min_n``\\ , no character ngrams are used, and the model effectively reduces to Word2Vec.\n\n\n\nTo bound the memory requirements of the model being trained, a hashing function is used that maps ngrams to integers in 1 to K. For hashing these character sequences, the `Fowler-Noll-Vo hashing function `_ (FNV-1a variant) is employed.\n\n\n" + "Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec:\n\n- model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)\n- vector_size: Dimensionality of vector embeddings to be learnt (Default 100)\n- alpha: Initial learning rate (Default 0.025)\n- window: Context window size (Default 5)\n- min_count: Ignore words with number of occurrences below this (Default 5)\n- loss: Training objective. 
Allowed values: `ns`, `hs`, `softmax` (Default `ns`)\n- sample: Threshold for downsampling higher-frequency words (Default 0.001)\n- negative: Number of negative words to sample, for `ns` (Default 5)\n- epochs: Number of epochs (Default 5)\n- sorted_vocab: Sort vocab by descending frequency (Default 1)\n- threads: Number of threads to use (Default 12)\n\n\nIn addition, fastText has three additional parameters:\n\n- min_n: min length of char ngrams (Default 3)\n- max_n: max length of char ngrams (Default 6)\n- bucket: number of buckets used for hashing ngrams (Default 2000000)\n\n\nParameters ``min_n`` and ``max_n`` control the lengths of character ngrams that each word is broken down into while training and looking up embeddings. If ``max_n`` is set to 0, or to be lesser than ``min_n``\\ , no character ngrams are used, and the model effectively reduces to Word2Vec.\n\n\n\nTo bound the memory requirements of the model being trained, a hashing function is used that maps ngrams to integers in 1 to K. For hashing these character sequences, the [Fowler-Noll-Vo hashing function](http://www.isthe.com/chongo/tech/comp/fnv) (FNV-1a variant) is employed.\n\n\n" ] }, { @@ -93,7 +93,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Saving/loading models\n---------------------\n\n\n" + "## Saving/loading models\n\n\n" ] }, { @@ -125,7 +125,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Word vector lookup\n------------------\n\n\nAll information necessary for looking up fastText words (incl. OOV words) is\ncontained in its ``model.wv`` attribute.\n\nIf you don't need to continue training your model, you can export & save this `.wv`\nattribute and discard `model`, to save space and RAM.\n\n\n" + "## Word vector lookup\n\n\nAll information necessary for looking up fastText words (incl. OOV words) is\ncontained in its ``model.wv`` attribute.\n\nIf you don't need to continue training your model, you can export & save this `.wv`\nattribute and discard `model`, to save space and RAM.\n\n\n" ] }, { @@ -176,7 +176,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similarity operations\n---------------------\n\n\n" + "## Similarity operations\n\n\n" ] }, { @@ -223,14 +223,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Syntactically similar words generally have high similarity in fastText models, since a large number of the component char-ngrams will be the same. As a result, fastText generally does better at syntactic tasks than Word2Vec. A detailed comparison is provided `here `_.\n\n\n" + "Syntactically similar words generally have high similarity in fastText models, since a large number of the component char-ngrams will be the same. As a result, fastText generally does better at syntactic tasks than Word2Vec. 
A detailed comparison is provided [here](Word2Vec_FastText_Comparison.ipynb).\n\n\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Other similarity operations\n^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nThe example training corpus is a toy corpus, results are not expected to be good, for proof-of-concept only\n\n" + "### Other similarity operations\n\nThe example training corpus is a toy corpus, results are not expected to be good, for proof-of-concept only\n\n" ] }, { @@ -292,7 +292,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Word Movers distance\n^^^^^^^^^^^^^^^^^^^^\n\nYou'll need the optional ``pyemd`` library for this section, ``pip install pyemd``.\n\nLet's start with two sentences:\n\n" + "### Word Movers distance\n\nYou'll need the optional ``POT`` library for this section, ``pip install POT``.\n\nLet's start with two sentences:\n\n" ] }, { @@ -377,7 +377,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_fasttext.py b/docs/src/auto_examples/tutorials/run_fasttext.py index 5a03b5d35e..ac3a1bc4c3 100644 --- a/docs/src/auto_examples/tutorials/run_fasttext.py +++ b/docs/src/auto_examples/tutorials/run_fasttext.py @@ -248,7 +248,7 @@ # Word Movers distance # ^^^^^^^^^^^^^^^^^^^^ # -# You'll need the optional ``pyemd`` library for this section, ``pip install pyemd``. +# You'll need the optional ``POT`` library for this section, ``pip install POT``. # # Let's start with two sentences: sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() diff --git a/docs/src/auto_examples/tutorials/run_fasttext.py.md5 b/docs/src/auto_examples/tutorials/run_fasttext.py.md5 index 68b4e4ced3..e227559329 100644 --- a/docs/src/auto_examples/tutorials/run_fasttext.py.md5 +++ b/docs/src/auto_examples/tutorials/run_fasttext.py.md5 @@ -1 +1 @@ -afc57676cfaca4793066a78efb2996f7 \ No newline at end of file +5f5ac745a06ff512074def4c0eb15f79 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_fasttext.rst b/docs/src/auto_examples/tutorials/run_fasttext.rst index 6d163b7910..539cab39e7 100644 --- a/docs/src/auto_examples/tutorials/run_fasttext.rst +++ b/docs/src/auto_examples/tutorials/run_fasttext.rst @@ -1,12 +1,21 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/tutorials/run_fasttext.py" +.. LINE NUMBERS ARE GIVEN BELOW. + .. only:: html .. note:: :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code - .. rst-class:: sphx-glr-example-title + Click :ref:`here ` + to download the full example code - .. _sphx_glr_auto_examples_tutorials_run_fasttext.py: +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_auto_examples_tutorials_run_fasttext.py: FastText Model @@ -14,6 +23,7 @@ FastText Model Introduces Gensim's fastText model and demonstrates its use on the Lee Corpus. +.. GENERATED FROM PYTHON SOURCE LINES 7-11 .. code-block:: default @@ -28,10 +38,14 @@ Introduces Gensim's fastText model and demonstrates its use on the Lee Corpus. +.. GENERATED FROM PYTHON SOURCE LINES 12-15 + Here, we'll learn to work with fastText library for training word-embedding models, saving & loading them and performing similarity operations & vector lookups analogous to Word2Vec. +.. GENERATED FROM PYTHON SOURCE LINES 18-45 + When to use fastText? 
--------------------- @@ -60,15 +74,20 @@ fastText can obtain vectors even for out-of-vocabulary (OOV) words, by summing u component char-ngrams, provided at least one of the char-ngrams was present in the training data. +.. GENERATED FROM PYTHON SOURCE LINES 49-52 + Training models --------------- +.. GENERATED FROM PYTHON SOURCE LINES 56-60 + For the following examples, we'll use the Lee Corpus (which you already have if you've installed Gensim) for training our model. +.. GENERATED FROM PYTHON SOURCE LINES 61-82 .. code-block:: default @@ -99,19 +118,44 @@ For the following examples, we'll use the Lee Corpus (which you already have if .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - - - - + 2022-10-23 11:05:20,779 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-10-23 11:05:20,779 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions) + 2022-10-23 11:05:20,782 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2022-10-23T11:05:20.780094', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'created'} + 2022-10-23 11:05:20,858 : INFO : FastText lifecycle event {'params': 'FastText', 'datetime': '2022-10-23T11:05:20.858457', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'created'} + 2022-10-23 11:05:20,858 : INFO : collecting all words and their counts + 2022-10-23 11:05:20,858 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types + 2022-10-23 11:05:20,874 : INFO : collected 10781 word types from a corpus of 59890 raw words and 300 sentences + 2022-10-23 11:05:20,874 : INFO : Creating a fresh vocabulary + 2022-10-23 11:05:20,882 : INFO : FastText lifecycle event {'msg': 'effective_min_count=5 retains 1762 unique words (16.34% of original 10781, drops 9019)', 'datetime': '2022-10-23T11:05:20.882842', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'prepare_vocab'} + 2022-10-23 11:05:20,882 : INFO : FastText lifecycle event {'msg': 'effective_min_count=5 leaves 46084 word corpus (76.95% of original 59890, drops 13806)', 'datetime': '2022-10-23T11:05:20.882944', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'prepare_vocab'} + 2022-10-23 11:05:20,892 : INFO : deleting the raw counts dictionary of 10781 items + 2022-10-23 11:05:20,892 : INFO : sample=0.001 downsamples 45 most-common words + 2022-10-23 11:05:20,893 : INFO : FastText lifecycle event {'msg': 'downsampling leaves estimated 32610.61883565215 word corpus (70.8%% of prior 46084)', 'datetime': '2022-10-23T11:05:20.893011', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'prepare_vocab'} + 2022-10-23 11:05:20,927 : INFO : estimated required memory for 1762 words, 2000000 buckets and 100 dimensions: 802597824 bytes + 2022-10-23 11:05:20,927 : INFO : resetting layer weights 
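The training log captured in this output block comes from a setup roughly like the one below. It is a sketch, not necessarily the tutorial cell verbatim: ``lee_background.cor`` is the Lee corpus file bundled with gensim's test data, and ``vector_size=100`` matches the "100 features" reported in the log.

.. code-block:: python

    from gensim.models import FastText
    from gensim.test.utils import datapath

    corpus_file = datapath('lee_background.cor')

    # Build the vocabulary and train directly from the corpus file.
    model = FastText(vector_size=100)
    model.build_vocab(corpus_file=corpus_file)
    model.train(
        corpus_file=corpus_file,
        total_words=model.corpus_total_words,
        epochs=model.epochs,
    )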
+ 2022-10-23 11:05:22,169 : INFO : FastText lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2022-10-23T11:05:22.169699', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'build_vocab'} + 2022-10-23 11:05:22,169 : INFO : FastText lifecycle event {'msg': 'training model with 3 workers on 1762 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-10-23T11:05:22.169966', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'train'} + 2022-10-23 11:05:22,273 : INFO : EPOCH 0: training on 60387 raw words (32958 effective words) took 0.1s, 355842 effective words/s + 2022-10-23 11:05:22,369 : INFO : EPOCH 1: training on 60387 raw words (32906 effective words) took 0.1s, 369792 effective words/s + 2022-10-23 11:05:22,466 : INFO : EPOCH 2: training on 60387 raw words (32863 effective words) took 0.1s, 361340 effective words/s + 2022-10-23 11:05:22,563 : INFO : EPOCH 3: training on 60387 raw words (32832 effective words) took 0.1s, 363904 effective words/s + 2022-10-23 11:05:22,662 : INFO : EPOCH 4: training on 60387 raw words (32827 effective words) took 0.1s, 355536 effective words/s + 2022-10-23 11:05:22,662 : INFO : FastText lifecycle event {'msg': 'training on 301935 raw words (164386 effective words) took 0.5s, 333704 effective words/s', 'datetime': '2022-10-23T11:05:22.662680', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'train'} + + + + + +.. GENERATED FROM PYTHON SOURCE LINES 83-86 Training hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^ +.. GENERATED FROM PYTHON SOURCE LINES 90-118 + Hyperparameters for training the model follow the same pattern as Word2Vec. FastText supports the following parameters from the original word2vec: - model: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`) @@ -141,17 +185,24 @@ Parameters ``min_n`` and ``max_n`` control the lengths of character ngrams that To bound the memory requirements of the model being trained, a hashing function is used that maps ngrams to integers in 1 to K. For hashing these character sequences, the `Fowler-Noll-Vo hashing function `_ (FNV-1a variant) is employed. +.. GENERATED FROM PYTHON SOURCE LINES 122-124 + **Note:** You can continue to train your model while using Gensim's native implementation of fastText. +.. GENERATED FROM PYTHON SOURCE LINES 128-131 + Saving/loading models --------------------- +.. GENERATED FROM PYTHON SOURCE LINES 135-138 + Models can be saved and loaded via the ``load`` and ``save`` methods, just like any other model in Gensim. +.. GENERATED FROM PYTHON SOURCE LINES 139-153 .. code-block:: default @@ -175,20 +226,35 @@ any other model in Gensim. .. rst-class:: sphx-glr-script-out - Out: - .. 
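To make the fastText-specific subword parameters described above concrete, here is a minimal illustration of how they are passed to the constructor. The specific values are hypothetical and chosen only to show the knobs; as noted above, setting ``max_n=0`` disables character n-grams and the model effectively reduces to Word2Vec.

.. code-block:: python

    from gensim.models import FastText

    # Narrower n-gram range and a smaller hash bucket table (illustrative values only),
    # which reduces the memory needed for the n-gram matrix.
    model_small_ngrams = FastText(vector_size=100, min_n=2, max_n=4, bucket=100000)

    # max_n=0 turns subword information off entirely: the model behaves like plain Word2Vec.
    model_no_ngrams = FastText(vector_size=100, max_n=0)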
code-block:: none - + 2022-10-23 11:05:22,826 : INFO : FastText lifecycle event {'fname_or_handle': '/tmp/saved_model_gensim-grsw1xyt', 'separately': '[]', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-10-23T11:05:22.826086', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'saving'} + 2022-10-23 11:05:22,827 : INFO : storing np array 'vectors_ngrams' to /tmp/saved_model_gensim-grsw1xyt.wv.vectors_ngrams.npy + 2022-10-23 11:05:24,259 : INFO : not storing attribute vectors + 2022-10-23 11:05:24,259 : INFO : not storing attribute buckets_word + 2022-10-23 11:05:24,260 : INFO : not storing attribute cum_table + 2022-10-23 11:05:24,289 : INFO : saved /tmp/saved_model_gensim-grsw1xyt + 2022-10-23 11:05:24,289 : INFO : loading FastText object from /tmp/saved_model_gensim-grsw1xyt + 2022-10-23 11:05:24,292 : INFO : loading wv recursively from /tmp/saved_model_gensim-grsw1xyt.wv.* with mmap=None + 2022-10-23 11:05:24,292 : INFO : loading vectors_ngrams from /tmp/saved_model_gensim-grsw1xyt.wv.vectors_ngrams.npy with mmap=None + 2022-10-23 11:05:24,594 : INFO : setting ignored attribute vectors to None + 2022-10-23 11:05:24,594 : INFO : setting ignored attribute buckets_word to None + 2022-10-23 11:05:24,673 : INFO : setting ignored attribute cum_table to None + 2022-10-23 11:05:24,689 : INFO : FastText lifecycle event {'fname': '/tmp/saved_model_gensim-grsw1xyt', 'datetime': '2022-10-23T11:05:24.689800', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'loaded'} + + +.. GENERATED FROM PYTHON SOURCE LINES 154-158 The ``save_word2vec_format`` is also available for fastText models, but will cause all vectors for ngrams to be lost. As a result, a model loaded in this way will behave as a regular word2vec model. +.. GENERATED FROM PYTHON SOURCE LINES 162-172 + Word vector lookup ------------------ @@ -200,6 +266,7 @@ If you don't need to continue training your model, you can export & save this `. attribute and discard `model`, to save space and RAM. +.. GENERATED FROM PYTHON SOURCE LINES 173-181 .. code-block:: default @@ -217,16 +284,15 @@ attribute and discard `model`, to save space and RAM. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - + True +.. GENERATED FROM PYTHON SOURCE LINES 183-185 .. code-block:: default @@ -238,8 +304,6 @@ attribute and discard `model`, to save space and RAM. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none False @@ -247,6 +311,7 @@ attribute and discard `model`, to save space and RAM. +.. GENERATED FROM PYTHON SOURCE LINES 187-189 .. code-block:: default @@ -258,35 +323,34 @@ attribute and discard `model`, to save space and RAM. .. rst-class:: sphx-glr-script-out - Out: - .. 
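A sketch of the two export paths discussed above, assuming ``model`` is the trained fastText model from earlier (file names are placeholders created with gensim's temp-file helper): keeping the ``.wv`` attribute preserves the n-gram vectors, whereas ``save_word2vec_format`` writes only full-word vectors.

.. code-block:: python

    from gensim.models import KeyedVectors
    from gensim.test.utils import get_tmpfile

    # Keep only the vectors if no further training is needed; the n-gram vectors survive
    # the round trip, so out-of-vocabulary lookup keeps working.
    wv_path = get_tmpfile('fasttext.kv')
    model.wv.save(wv_path)
    wv = KeyedVectors.load(wv_path)

    # The word2vec text format stores full-word vectors only: the reloaded object
    # behaves like a regular word2vec model and loses OOV support.
    w2v_path = get_tmpfile('fasttext_plain.txt')
    model.wv.save_word2vec_format(w2v_path)
    plain_wv = KeyedVectors.load_word2vec_format(w2v_path)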
code-block:: none - array([ 0.12453239, -0.26018462, -0.04087191, 0.2563215 , 0.31401935, - 0.16155584, 0.39527607, 0.27404118, -0.45236284, 0.06942682, - 0.36584955, 0.51162827, -0.51161295, -0.192019 , -0.5068029 , - -0.07426998, -0.6276584 , 0.22271585, 0.19990133, 0.2582401 , - 0.14329399, -0.01959469, -0.45576197, -0.06447829, 0.1493489 , - 0.17261286, -0.13472046, 0.26546794, -0.34596932, 0.5626187 , - -0.7038802 , 0.15603925, -0.03104019, -0.06228801, -0.13480644, - -0.0684596 , 0.24728075, 0.55081636, 0.07330963, 0.32814154, - 0.1574982 , 0.56742406, -0.31233737, 0.14195296, 0.0540203 , - 0.01718009, 0.05519052, -0.04002226, 0.16157456, -0.5134223 , - -0.01033936, 0.05745083, -0.39208183, 0.52553374, -1.0542839 , - 0.2145304 , -0.15234643, -0.35197273, -0.6215585 , 0.01796502, - 0.21242104, 0.30762967, 0.2787644 , -0.19908747, 0.7144409 , - 0.45586124, -0.21344525, 0.26920903, -0.651759 , -0.37096855, - -0.16243419, -0.3085725 , -0.70485127, -0.04926324, -0.80278563, - -0.24352737, 0.6427129 , -0.3530421 , -0.29960123, 0.01466726, - -0.18253349, -0.2489397 , 0.00648343, 0.18057272, -0.11812428, - -0.49044088, 0.1847386 , -0.27946883, 0.3941279 , -0.39211616, - 0.26847798, 0.41468227, -0.3953728 , -0.25371104, 0.3390468 , - -0.16447693, -0.18722224, 0.2782088 , -0.0696249 , 0.4313547 ], + array([-0.19996722, 0.1813906 , -0.2631422 , -0.09450997, 0.0605551 , + 0.38595745, 0.30778143, 0.5067505 , 0.23698695, -0.23913051, + 0.02506454, -0.15320891, -0.2434152 , 0.52560467, -0.38980618, + -0.55800015, 0.19291814, -0.23110117, -0.43341738, -0.53108984, + -0.4688596 , -0.04782811, -0.46767992, -0.1137548 , -0.20153292, + -0.31324366, -0.6708753 , -0.10945056, -0.31843412, 0.26011363, + -0.32820454, 0.32238692, 0.8404276 , -0.2502807 , 0.19792764, + 0.37759355, 0.40180317, -0.09189364, -0.36985794, -0.33649284, + 0.46887243, -0.43174997, 0.04100857, -0.39025533, -0.51651365, + -0.32087606, -0.05997978, 0.14294061, 0.360094 , -0.02155857, + 0.37047735, -0.44327876, 0.28450134, -0.4054028 , -0.19731535, + -0.21376207, -0.1685454 , -0.12901361, 0.03528974, -0.35231775, + -0.35454988, -0.43326724, -0.21185161, 0.3519939 , -0.11108 , + 0.69391364, 0.05785353, 0.05663215, 0.42399758, 0.24977471, + -0.24918619, 0.3934391 , 0.5109367 , -0.6553013 , 0.33610865, + -0.09825795, 0.25878346, -0.03377685, 0.06902322, 0.37547323, + 0.17450804, -0.5030028 , -0.82190335, -0.15457787, -0.12070727, + -0.78729135, 0.49075758, 0.19234893, -0.01774574, -0.28116694, + -0.02472195, 0.40292844, -0.14185381, 0.07625303, -0.20744859, + 0.59728205, -0.2217386 , -0.29148448, -0.01873052, -0.2401561 ], dtype=float32) +.. GENERATED FROM PYTHON SOURCE LINES 191-194 .. code-block:: default @@ -299,42 +363,45 @@ attribute and discard `model`, to save space and RAM. .. rst-class:: sphx-glr-script-out - Out: - .. 
code-block:: none - array([ 0.10586783, -0.22489995, -0.03636307, 0.22263278, 0.27037606, - 0.1394871 , 0.3411114 , 0.2369042 , -0.38989475, 0.05935 , - 0.31713557, 0.44301754, -0.44249156, -0.16652377, -0.4388366 , - -0.06266895, -0.5436303 , 0.19294666, 0.17363031, 0.22459263, - 0.12532061, -0.01866964, -0.3936521 , -0.05507145, 0.12905194, - 0.14942174, -0.11657442, 0.22935589, -0.29934618, 0.4859668 , - -0.6073519 , 0.13433163, -0.02491274, -0.05468523, -0.11884545, - -0.06117092, 0.21444008, 0.4775469 , 0.06227469, 0.28350767, - 0.13580805, 0.48993143, -0.27067345, 0.1252003 , 0.04606731, - 0.01598426, 0.04640368, -0.03456376, 0.14138013, -0.44429192, - -0.00865329, 0.05027836, -0.341311 , 0.45402458, -0.91097856, - 0.1868968 , -0.13116683, -0.30361563, -0.5364188 , 0.01603454, - 0.18146741, 0.26708448, 0.24074472, -0.17163375, 0.61906886, - 0.39530373, -0.18259627, 0.23319626, -0.5634787 , -0.31959867, - -0.13945322, -0.269441 , -0.60941464, -0.0403638 , -0.69563633, - -0.2098089 , 0.5569868 , -0.30320194, -0.25840232, 0.01436759, - -0.15632603, -0.21624804, 0.00434287, 0.15566474, -0.10228094, - -0.4249678 , 0.16197811, -0.24147548, 0.34205705, -0.3391568 , - 0.23235887, 0.35860622, -0.34247142, -0.21777524, 0.29318404, - -0.1407287 , -0.16115218, 0.24247572, -0.06217333, 0.37221798], + array([-0.17333212, 0.15747589, -0.22726758, -0.08140025, 0.05103909, + 0.33196837, 0.2670658 , 0.43939307, 0.205082 , -0.20810795, + 0.02336278, -0.13075203, -0.21126968, 0.45168898, -0.33789524, + -0.48235178, 0.16582203, -0.19900155, -0.3727986 , -0.4591713 , + -0.401847 , -0.04239817, -0.40366223, -0.09961417, -0.17264459, + -0.26896393, -0.57774097, -0.09225026, -0.27459562, 0.22605109, + -0.28136173, 0.27779424, 0.72365224, -0.21562205, 0.17094932, + 0.3253317 , 0.34816158, -0.07930711, -0.31941393, -0.29101238, + 0.40383977, -0.3717381 , 0.03487907, -0.33628452, -0.4465965 , + -0.27571818, -0.0488493 , 0.12399682, 0.31216368, -0.01752434, + 0.32131058, -0.38280696, 0.24619998, -0.34979105, -0.16987896, + -0.18326469, -0.14740779, -0.1095791 , 0.03177686, -0.30144197, + -0.30499157, -0.37426412, -0.18248272, 0.3032632 , -0.09528783, + 0.59990335, 0.05005969, 0.04626458, 0.36565247, 0.21673569, + -0.2155152 , 0.33764148, 0.4421136 , -0.56542957, 0.29158652, + -0.08375975, 0.22272962, -0.02998246, 0.05934277, 0.3240713 , + 0.1511237 , -0.43450487, -0.7087094 , -0.13446207, -0.10318276, + -0.6806781 , 0.42355484, 0.1661925 , -0.01327086, -0.2432955 , + -0.02126789, 0.34654808, -0.12292334, 0.06645596, -0.1795192 , + 0.5156855 , -0.19275527, -0.24794976, -0.01581961, -0.2081413 ], dtype=float32) +.. GENERATED FROM PYTHON SOURCE LINES 195-198 + Similarity operations --------------------- +.. GENERATED FROM PYTHON SOURCE LINES 202-204 + Similarity operations work the same way as word2vec. **Out-of-vocabulary words can also be used, provided they have at least one character ngram present in the training data.** +.. GENERATED FROM PYTHON SOURCE LINES 205-209 .. code-block:: default @@ -348,8 +415,6 @@ Similarity operations work the same way as word2vec. **Out-of-vocabulary words c .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none False @@ -357,6 +422,7 @@ Similarity operations work the same way as word2vec. **Out-of-vocabulary words c +.. GENERATED FROM PYTHON SOURCE LINES 211-213 .. code-block:: default @@ -368,8 +434,6 @@ Similarity operations work the same way as word2vec. **Out-of-vocabulary words c .. rst-class:: sphx-glr-script-out - Out: - .. 
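The out-of-vocabulary behaviour described above can be checked directly: a word that never occurred in the toy corpus still gets a vector, assembled from the n-gram vectors it shares with in-vocabulary words. A small sketch, where ``nightsky`` is just an arbitrary made-up token and ``model`` is the trained fastText model from above:

.. code-block:: python

    wv = model.wv
    oov_word = 'nightsky'   # arbitrary made-up token, not present in the training data

    print(oov_word in wv.key_to_index)        # False: not in the vocabulary
    print(wv[oov_word].shape)                 # (100,): a vector is still synthesized from char n-grams
    print(wv.similarity(oov_word, 'night'))   # typically high, since it shares most n-grams with 'night'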
code-block:: none True @@ -377,6 +441,7 @@ Similarity operations work the same way as word2vec. **Out-of-vocabulary words c +.. GENERATED FROM PYTHON SOURCE LINES 215-217 .. code-block:: default @@ -388,23 +453,26 @@ Similarity operations work the same way as word2vec. **Out-of-vocabulary words c .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - 0.9999929 + 0.9999918 + +.. GENERATED FROM PYTHON SOURCE LINES 218-220 Syntactically similar words generally have high similarity in fastText models, since a large number of the component char-ngrams will be the same. As a result, fastText generally does better at syntactic tasks than Word2Vec. A detailed comparison is provided `here `_. +.. GENERATED FROM PYTHON SOURCE LINES 224-228 + Other similarity operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^ The example training corpus is a toy corpus, results are not expected to be good, for proof-of-concept only +.. GENERATED FROM PYTHON SOURCE LINES 229-231 .. code-block:: default @@ -416,24 +484,23 @@ The example training corpus is a toy corpus, results are not expected to be good .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - [('night', 0.9999929070472717), - ('night.', 0.9999895095825195), - ('flights', 0.999988853931427), - ('rights', 0.9999886751174927), - ('residents', 0.9999884366989136), - ('overnight', 0.9999883770942688), - ('commanders', 0.999988317489624), - ('reached', 0.9999881386756897), - ('commander', 0.9999880790710449), - ('leading', 0.999987781047821)] + [('night', 0.9999918341636658), + ('rights', 0.9999877214431763), + ('flights', 0.9999877214431763), + ('overnight', 0.999987006187439), + ('fighting', 0.9999857544898987), + ('fighters', 0.9999855160713196), + ('fight', 0.9999852180480957), + ('entered', 0.9999851584434509), + ('fighter', 0.999984860420227), + ('eight', 0.999984622001648)] +.. GENERATED FROM PYTHON SOURCE LINES 233-235 .. code-block:: default @@ -445,15 +512,14 @@ The example training corpus is a toy corpus, results are not expected to be good .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - 0.9999402 + 0.99993986 +.. GENERATED FROM PYTHON SOURCE LINES 237-239 .. code-block:: default @@ -465,8 +531,6 @@ The example training corpus is a toy corpus, results are not expected to be good .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none 'lunch' @@ -474,6 +538,7 @@ The example training corpus is a toy corpus, results are not expected to be good +.. GENERATED FROM PYTHON SOURCE LINES 241-243 .. code-block:: default @@ -485,24 +550,23 @@ The example training corpus is a toy corpus, results are not expected to be good .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - [('attempt', 0.999660074710846), - ('biggest', 0.9996545314788818), - ('again', 0.9996527433395386), - ('against', 0.9996523857116699), - ('doubles', 0.9996522068977356), - ('Royal', 0.9996512532234192), - ('Airlines', 0.9996494054794312), - ('forced', 0.9996494054794312), - ('arrest', 0.9996492266654968), - ('follows', 0.999649167060852)] + [('find', 0.9996394515037537), + ('capital,', 0.999639093875885), + ('findings', 0.9996339082717896), + ('seekers.', 0.9996323585510254), + ('field', 0.9996322393417358), + ('finding', 0.9996311664581299), + ('had', 0.9996305704116821), + ('abuse', 0.9996281862258911), + ('storm', 0.9996268153190613), + ('heading', 0.9996247291564941)] +.. GENERATED FROM PYTHON SOURCE LINES 245-247 .. 
code-block:: default @@ -514,11 +578,20 @@ The example training corpus is a toy corpus, results are not expected to be good .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - (0.24489795918367346, + 2022-10-23 11:05:26,790 : INFO : Evaluating word analogies for top 300000 words in the model on /home/thomas/Documents/FOSS/gensim-tlouf/gensim/test/test_data/questions-words.txt + 2022-10-23 11:05:26,814 : INFO : family: 0.0% (0/2) + 2022-10-23 11:05:26,822 : INFO : gram3-comparative: 8.3% (1/12) + 2022-10-23 11:05:26,827 : INFO : gram4-superlative: 33.3% (4/12) + 2022-10-23 11:05:26,832 : INFO : gram5-present-participle: 45.0% (9/20) + 2022-10-23 11:05:26,845 : INFO : gram6-nationality-adjective: 30.0% (6/20) + 2022-10-23 11:05:26,851 : INFO : gram7-past-tense: 5.0% (1/20) + 2022-10-23 11:05:26,856 : INFO : gram8-plural: 33.3% (4/12) + 2022-10-23 11:05:26,858 : INFO : Quadruplets with out-of-vocabulary words: 99.5% + 2022-10-23 11:05:26,859 : INFO : NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True" + 2022-10-23 11:05:26,859 : INFO : Total accuracy: 25.5% (25/98) + (0.25510204081632654, [{'correct': [], 'incorrect': [], 'section': 'capital-common-countries'}, {'correct': [], 'incorrect': [], 'section': 'capital-world'}, {'correct': [], 'incorrect': [], 'section': 'currency'}, @@ -528,81 +601,80 @@ The example training corpus is a toy corpus, results are not expected to be good 'section': 'family'}, {'correct': [], 'incorrect': [], 'section': 'gram1-adjective-to-adverb'}, {'correct': [], 'incorrect': [], 'section': 'gram2-opposite'}, - {'correct': [('GOOD', 'BETTER', 'LOW', 'LOWER'), - ('GREAT', 'GREATER', 'LOW', 'LOWER'), - ('LONG', 'LONGER', 'LOW', 'LOWER')], + {'correct': [('LONG', 'LONGER', 'GREAT', 'GREATER')], 'incorrect': [('GOOD', 'BETTER', 'GREAT', 'GREATER'), ('GOOD', 'BETTER', 'LONG', 'LONGER'), + ('GOOD', 'BETTER', 'LOW', 'LOWER'), ('GREAT', 'GREATER', 'LONG', 'LONGER'), + ('GREAT', 'GREATER', 'LOW', 'LOWER'), ('GREAT', 'GREATER', 'GOOD', 'BETTER'), + ('LONG', 'LONGER', 'LOW', 'LOWER'), ('LONG', 'LONGER', 'GOOD', 'BETTER'), - ('LONG', 'LONGER', 'GREAT', 'GREATER'), ('LOW', 'LOWER', 'GOOD', 'BETTER'), ('LOW', 'LOWER', 'GREAT', 'GREATER'), ('LOW', 'LOWER', 'LONG', 'LONGER')], 'section': 'gram3-comparative'}, - {'correct': [('BIG', 'BIGGEST', 'LARGE', 'LARGEST'), - ('GOOD', 'BEST', 'LARGE', 'LARGEST'), - ('GREAT', 'GREATEST', 'LARGE', 'LARGEST')], + {'correct': [('GOOD', 'BEST', 'LARGE', 'LARGEST'), + ('GREAT', 'GREATEST', 'LARGE', 'LARGEST'), + ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'), + ('LARGE', 'LARGEST', 'BIG', 'BIGGEST')], 'incorrect': [('BIG', 'BIGGEST', 'GOOD', 'BEST'), ('BIG', 'BIGGEST', 'GREAT', 'GREATEST'), + ('BIG', 'BIGGEST', 'LARGE', 'LARGEST'), ('GOOD', 'BEST', 'GREAT', 'GREATEST'), ('GOOD', 'BEST', 'BIG', 'BIGGEST'), - ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'), ('GREAT', 'GREATEST', 'GOOD', 'BEST'), - ('LARGE', 'LARGEST', 'BIG', 'BIGGEST'), ('LARGE', 'LARGEST', 'GOOD', 'BEST'), ('LARGE', 'LARGEST', 'GREAT', 'GREATEST')], 'section': 'gram4-superlative'}, - {'correct': [('GO', 'GOING', 'SAY', 'SAYING'), - ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'), + {'correct': [('GO', 'GOING', 'PLAY', 'PLAYING'), + ('GO', 'GOING', 'SAY', 'SAYING'), ('LOOK', 'LOOKING', 'SAY', 'SAYING'), ('LOOK', 'LOOKING', 'GO', 'GOING'), ('PLAY', 'PLAYING', 'SAY', 'SAYING'), ('PLAY', 'PLAYING', 'GO', 'GOING'), - ('SAY', 'SAYING', 'GO', 'GOING')], + ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'), + ('SAY', 'SAYING', 'GO', 'GOING'), + 
('SAY', 'SAYING', 'PLAY', 'PLAYING')], 'incorrect': [('GO', 'GOING', 'LOOK', 'LOOKING'), - ('GO', 'GOING', 'PLAY', 'PLAYING'), ('GO', 'GOING', 'RUN', 'RUNNING'), + ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'), ('LOOK', 'LOOKING', 'RUN', 'RUNNING'), ('PLAY', 'PLAYING', 'RUN', 'RUNNING'), - ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'), ('RUN', 'RUNNING', 'SAY', 'SAYING'), ('RUN', 'RUNNING', 'GO', 'GOING'), ('RUN', 'RUNNING', 'LOOK', 'LOOKING'), ('RUN', 'RUNNING', 'PLAY', 'PLAYING'), ('SAY', 'SAYING', 'LOOK', 'LOOKING'), - ('SAY', 'SAYING', 'PLAY', 'PLAYING'), ('SAY', 'SAYING', 'RUN', 'RUNNING')], 'section': 'gram5-present-participle'}, {'correct': [('AUSTRALIA', 'AUSTRALIAN', 'INDIA', 'INDIAN'), ('AUSTRALIA', 'AUSTRALIAN', 'ISRAEL', 'ISRAELI'), ('FRANCE', 'FRENCH', 'INDIA', 'INDIAN'), ('INDIA', 'INDIAN', 'ISRAEL', 'ISRAELI'), - ('ISRAEL', 'ISRAELI', 'INDIA', 'INDIAN'), - ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN')], + ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'), + ('ISRAEL', 'ISRAELI', 'INDIA', 'INDIAN')], 'incorrect': [('AUSTRALIA', 'AUSTRALIAN', 'FRANCE', 'FRENCH'), ('AUSTRALIA', 'AUSTRALIAN', 'SWITZERLAND', 'SWISS'), ('FRANCE', 'FRENCH', 'ISRAEL', 'ISRAELI'), ('FRANCE', 'FRENCH', 'SWITZERLAND', 'SWISS'), ('FRANCE', 'FRENCH', 'AUSTRALIA', 'AUSTRALIAN'), ('INDIA', 'INDIAN', 'SWITZERLAND', 'SWISS'), - ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'), ('INDIA', 'INDIAN', 'FRANCE', 'FRENCH'), ('ISRAEL', 'ISRAELI', 'SWITZERLAND', 'SWISS'), ('ISRAEL', 'ISRAELI', 'AUSTRALIA', 'AUSTRALIAN'), ('ISRAEL', 'ISRAELI', 'FRANCE', 'FRENCH'), ('SWITZERLAND', 'SWISS', 'AUSTRALIA', 'AUSTRALIAN'), ('SWITZERLAND', 'SWISS', 'FRANCE', 'FRENCH'), + ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN'), ('SWITZERLAND', 'SWISS', 'ISRAEL', 'ISRAELI')], 'section': 'gram6-nationality-adjective'}, - {'correct': [], + {'correct': [('PAYING', 'PAID', 'SAYING', 'SAID')], 'incorrect': [('GOING', 'WENT', 'PAYING', 'PAID'), ('GOING', 'WENT', 'PLAYING', 'PLAYED'), ('GOING', 'WENT', 'SAYING', 'SAID'), ('GOING', 'WENT', 'TAKING', 'TOOK'), ('PAYING', 'PAID', 'PLAYING', 'PLAYED'), - ('PAYING', 'PAID', 'SAYING', 'SAID'), ('PAYING', 'PAID', 'TAKING', 'TOOK'), ('PAYING', 'PAID', 'GOING', 'WENT'), ('PLAYING', 'PLAYED', 'SAYING', 'SAID'), @@ -618,76 +690,76 @@ The example training corpus is a toy corpus, results are not expected to be good ('TAKING', 'TOOK', 'PLAYING', 'PLAYED'), ('TAKING', 'TOOK', 'SAYING', 'SAID')], 'section': 'gram7-past-tense'}, - {'correct': [('BUILDING', 'BUILDINGS', 'CAR', 'CARS'), - ('BUILDING', 'BUILDINGS', 'CHILD', 'CHILDREN'), - ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'), - ('CHILD', 'CHILDREN', 'CAR', 'CARS'), - ('MAN', 'MEN', 'CAR', 'CARS')], - 'incorrect': [('BUILDING', 'BUILDINGS', 'MAN', 'MEN'), - ('CAR', 'CARS', 'CHILD', 'CHILDREN'), + {'correct': [('BUILDING', 'BUILDINGS', 'CHILD', 'CHILDREN'), + ('CAR', 'CARS', 'CHILD', 'CHILDREN'), + ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'), + ('MAN', 'MEN', 'CHILD', 'CHILDREN')], + 'incorrect': [('BUILDING', 'BUILDINGS', 'CAR', 'CARS'), + ('BUILDING', 'BUILDINGS', 'MAN', 'MEN'), ('CAR', 'CARS', 'MAN', 'MEN'), + ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'), ('CHILD', 'CHILDREN', 'MAN', 'MEN'), ('CHILD', 'CHILDREN', 'BUILDING', 'BUILDINGS'), - ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'), - ('MAN', 'MEN', 'CHILD', 'CHILDREN')], + ('CHILD', 'CHILDREN', 'CAR', 'CARS'), + ('MAN', 'MEN', 'CAR', 'CARS')], 'section': 'gram8-plural'}, {'correct': [], 'incorrect': [], 'section': 'gram9-plural-verbs'}, - {'correct': [('GOOD', 'BETTER', 'LOW', 'LOWER'), - ('GREAT', 'GREATER', 'LOW', 'LOWER'), - ('LONG', 
'LONGER', 'LOW', 'LOWER'), - ('BIG', 'BIGGEST', 'LARGE', 'LARGEST'), + {'correct': [('LONG', 'LONGER', 'GREAT', 'GREATER'), ('GOOD', 'BEST', 'LARGE', 'LARGEST'), ('GREAT', 'GREATEST', 'LARGE', 'LARGEST'), + ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'), + ('LARGE', 'LARGEST', 'BIG', 'BIGGEST'), + ('GO', 'GOING', 'PLAY', 'PLAYING'), ('GO', 'GOING', 'SAY', 'SAYING'), - ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'), ('LOOK', 'LOOKING', 'SAY', 'SAYING'), ('LOOK', 'LOOKING', 'GO', 'GOING'), ('PLAY', 'PLAYING', 'SAY', 'SAYING'), ('PLAY', 'PLAYING', 'GO', 'GOING'), + ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'), ('SAY', 'SAYING', 'GO', 'GOING'), + ('SAY', 'SAYING', 'PLAY', 'PLAYING'), ('AUSTRALIA', 'AUSTRALIAN', 'INDIA', 'INDIAN'), ('AUSTRALIA', 'AUSTRALIAN', 'ISRAEL', 'ISRAELI'), ('FRANCE', 'FRENCH', 'INDIA', 'INDIAN'), ('INDIA', 'INDIAN', 'ISRAEL', 'ISRAELI'), + ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'), ('ISRAEL', 'ISRAELI', 'INDIA', 'INDIAN'), - ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN'), - ('BUILDING', 'BUILDINGS', 'CAR', 'CARS'), + ('PAYING', 'PAID', 'SAYING', 'SAID'), ('BUILDING', 'BUILDINGS', 'CHILD', 'CHILDREN'), - ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'), - ('CHILD', 'CHILDREN', 'CAR', 'CARS'), - ('MAN', 'MEN', 'CAR', 'CARS')], + ('CAR', 'CARS', 'CHILD', 'CHILDREN'), + ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'), + ('MAN', 'MEN', 'CHILD', 'CHILDREN')], 'incorrect': [('HE', 'SHE', 'HIS', 'HER'), ('HIS', 'HER', 'HE', 'SHE'), ('GOOD', 'BETTER', 'GREAT', 'GREATER'), ('GOOD', 'BETTER', 'LONG', 'LONGER'), + ('GOOD', 'BETTER', 'LOW', 'LOWER'), ('GREAT', 'GREATER', 'LONG', 'LONGER'), + ('GREAT', 'GREATER', 'LOW', 'LOWER'), ('GREAT', 'GREATER', 'GOOD', 'BETTER'), + ('LONG', 'LONGER', 'LOW', 'LOWER'), ('LONG', 'LONGER', 'GOOD', 'BETTER'), - ('LONG', 'LONGER', 'GREAT', 'GREATER'), ('LOW', 'LOWER', 'GOOD', 'BETTER'), ('LOW', 'LOWER', 'GREAT', 'GREATER'), ('LOW', 'LOWER', 'LONG', 'LONGER'), ('BIG', 'BIGGEST', 'GOOD', 'BEST'), ('BIG', 'BIGGEST', 'GREAT', 'GREATEST'), + ('BIG', 'BIGGEST', 'LARGE', 'LARGEST'), ('GOOD', 'BEST', 'GREAT', 'GREATEST'), ('GOOD', 'BEST', 'BIG', 'BIGGEST'), - ('GREAT', 'GREATEST', 'BIG', 'BIGGEST'), ('GREAT', 'GREATEST', 'GOOD', 'BEST'), - ('LARGE', 'LARGEST', 'BIG', 'BIGGEST'), ('LARGE', 'LARGEST', 'GOOD', 'BEST'), ('LARGE', 'LARGEST', 'GREAT', 'GREATEST'), ('GO', 'GOING', 'LOOK', 'LOOKING'), - ('GO', 'GOING', 'PLAY', 'PLAYING'), ('GO', 'GOING', 'RUN', 'RUNNING'), + ('LOOK', 'LOOKING', 'PLAY', 'PLAYING'), ('LOOK', 'LOOKING', 'RUN', 'RUNNING'), ('PLAY', 'PLAYING', 'RUN', 'RUNNING'), - ('PLAY', 'PLAYING', 'LOOK', 'LOOKING'), ('RUN', 'RUNNING', 'SAY', 'SAYING'), ('RUN', 'RUNNING', 'GO', 'GOING'), ('RUN', 'RUNNING', 'LOOK', 'LOOKING'), ('RUN', 'RUNNING', 'PLAY', 'PLAYING'), ('SAY', 'SAYING', 'LOOK', 'LOOKING'), - ('SAY', 'SAYING', 'PLAY', 'PLAYING'), ('SAY', 'SAYING', 'RUN', 'RUNNING'), ('AUSTRALIA', 'AUSTRALIAN', 'FRANCE', 'FRENCH'), ('AUSTRALIA', 'AUSTRALIAN', 'SWITZERLAND', 'SWISS'), @@ -695,20 +767,19 @@ The example training corpus is a toy corpus, results are not expected to be good ('FRANCE', 'FRENCH', 'SWITZERLAND', 'SWISS'), ('FRANCE', 'FRENCH', 'AUSTRALIA', 'AUSTRALIAN'), ('INDIA', 'INDIAN', 'SWITZERLAND', 'SWISS'), - ('INDIA', 'INDIAN', 'AUSTRALIA', 'AUSTRALIAN'), ('INDIA', 'INDIAN', 'FRANCE', 'FRENCH'), ('ISRAEL', 'ISRAELI', 'SWITZERLAND', 'SWISS'), ('ISRAEL', 'ISRAELI', 'AUSTRALIA', 'AUSTRALIAN'), ('ISRAEL', 'ISRAELI', 'FRANCE', 'FRENCH'), ('SWITZERLAND', 'SWISS', 'AUSTRALIA', 'AUSTRALIAN'), ('SWITZERLAND', 'SWISS', 'FRANCE', 'FRENCH'), + ('SWITZERLAND', 'SWISS', 'INDIA', 'INDIAN'), 
('SWITZERLAND', 'SWISS', 'ISRAEL', 'ISRAELI'), ('GOING', 'WENT', 'PAYING', 'PAID'), ('GOING', 'WENT', 'PLAYING', 'PLAYED'), ('GOING', 'WENT', 'SAYING', 'SAID'), ('GOING', 'WENT', 'TAKING', 'TOOK'), ('PAYING', 'PAID', 'PLAYING', 'PLAYED'), - ('PAYING', 'PAID', 'SAYING', 'SAID'), ('PAYING', 'PAID', 'TAKING', 'TOOK'), ('PAYING', 'PAID', 'GOING', 'WENT'), ('PLAYING', 'PLAYED', 'SAYING', 'SAID'), @@ -723,25 +794,29 @@ The example training corpus is a toy corpus, results are not expected to be good ('TAKING', 'TOOK', 'PAYING', 'PAID'), ('TAKING', 'TOOK', 'PLAYING', 'PLAYED'), ('TAKING', 'TOOK', 'SAYING', 'SAID'), + ('BUILDING', 'BUILDINGS', 'CAR', 'CARS'), ('BUILDING', 'BUILDINGS', 'MAN', 'MEN'), - ('CAR', 'CARS', 'CHILD', 'CHILDREN'), ('CAR', 'CARS', 'MAN', 'MEN'), + ('CAR', 'CARS', 'BUILDING', 'BUILDINGS'), ('CHILD', 'CHILDREN', 'MAN', 'MEN'), ('CHILD', 'CHILDREN', 'BUILDING', 'BUILDINGS'), - ('MAN', 'MEN', 'BUILDING', 'BUILDINGS'), - ('MAN', 'MEN', 'CHILD', 'CHILDREN')], + ('CHILD', 'CHILDREN', 'CAR', 'CARS'), + ('MAN', 'MEN', 'CAR', 'CARS')], 'section': 'Total accuracy'}]) +.. GENERATED FROM PYTHON SOURCE LINES 248-254 + Word Movers distance ^^^^^^^^^^^^^^^^^^^^ -You'll need the optional ``pyemd`` library for this section, ``pip install pyemd``. +You'll need the optional ``POT`` library for this section, ``pip install POT``. Let's start with two sentences: +.. GENERATED FROM PYTHON SOURCE LINES 254-258 .. code-block:: default @@ -756,9 +831,12 @@ Let's start with two sentences: +.. GENERATED FROM PYTHON SOURCE LINES 259-261 + Remove their stopwords. +.. GENERATED FROM PYTHON SOURCE LINES 261-265 .. code-block:: default @@ -773,8 +851,11 @@ Remove their stopwords. +.. GENERATED FROM PYTHON SOURCE LINES 266-267 + Compute the Word Movers Distance between the two sentences. +.. GENERATED FROM PYTHON SOURCE LINES 267-270 .. code-block:: default @@ -787,18 +868,22 @@ Compute the Word Movers Distance between the two sentences. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - 'Word Movers Distance is 0.015923231075180694 (lower means closer)' + 2022-10-23 11:05:27,139 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-10-23 11:05:27,140 : INFO : built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions) + 2022-10-23 11:05:27,140 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)", 'datetime': '2022-10-23T11:05:27.140129', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'created'} + 'Word Movers Distance is 0.01600033861640832 (lower means closer)' +.. GENERATED FROM PYTHON SOURCE LINES 271-273 + That's all! You've made it to the end of this tutorial. +.. GENERATED FROM PYTHON SOURCE LINES 273-278 .. code-block:: default @@ -810,9 +895,10 @@ That's all! You've made it to the end of this tutorial. -.. image:: /auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png - :alt: run fasttext - :class: sphx-glr-single-img +.. image-sg:: /auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png + :alt: run fasttext + :srcset: /auto_examples/tutorials/images/sphx_glr_run_fasttext_001.png + :class: sphx-glr-single-img @@ -821,30 +907,25 @@ That's all! You've made it to the end of this tutorial. .. 
rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 28.645 seconds) + **Total running time of the script:** ( 0 minutes 7.208 seconds) -**Estimated memory usage:** 2975 MB +**Estimated memory usage:** 1619 MB .. _sphx_glr_download_auto_examples_tutorials_run_fasttext.py: +.. only:: html -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download sphx-glr-download-python + .. container:: sphx-glr-footer sphx-glr-footer-example - :download:`Download Python source code: run_fasttext.py ` + .. container:: sphx-glr-download sphx-glr-download-python + :download:`Download Python source code: run_fasttext.py ` - .. container:: sphx-glr-download sphx-glr-download-jupyter + .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download Jupyter notebook: run_fasttext.ipynb ` + :download:`Download Jupyter notebook: run_fasttext.ipynb ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/run_fasttext_codeobj.pickle b/docs/src/auto_examples/tutorials/run_fasttext_codeobj.pickle new file mode 100644 index 0000000000..1be3e19b04 Binary files /dev/null and b/docs/src/auto_examples/tutorials/run_fasttext_codeobj.pickle differ diff --git a/docs/src/auto_examples/tutorials/run_wmd.ipynb b/docs/src/auto_examples/tutorials/run_wmd.ipynb index 99711a9278..89a002f539 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.ipynb +++ b/docs/src/auto_examples/tutorials/run_wmd.ipynb @@ -15,14 +15,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\nWord Mover's Distance\n=====================\n\nDemonstrates using Gensim's implemenation of the WMD.\n" + "\n# Word Mover's Distance\n\nDemonstrates using Gensim's implemenation of the WMD.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Word Mover's Distance (WMD) is a promising new tool in machine learning that\nallows us to submit a query and return the most relevant documents. This\ntutorial introduces WMD and shows how you can compute the WMD distance\nbetween two documents using ``wmdistance``.\n\nWMD Basics\n----------\n\nWMD enables us to assess the \"distance\" between two documents in a meaningful\nway even when they have no words in common. It uses `word2vec\n`_ [4] vector embeddings of\nwords. It been shown to outperform many of the state-of-the-art methods in\nk-nearest neighbors classification [3].\n\nWMD is illustrated below for two very similar sentences (illustration taken\nfrom `Vlad Niculae's blog\n`_). The sentences\nhave no words in common, but by matching the relevant words, WMD is able to\naccurately measure the (dis)similarity between the two sentences. The method\nalso uses the bag-of-words representation of the documents (simply put, the\nword's frequencies in the documents), noted as $d$ in the figure below. The\nintuition behind the method is that we find the minimum \"traveling distance\"\nbetween documents, in other words the most efficient way to \"move\" the\ndistribution of document 1 to the distribution of document 2.\n\n\n" + "Word Mover's Distance (WMD) is a promising new tool in machine learning that\nallows us to submit a query and return the most relevant documents. This\ntutorial introduces WMD and shows how you can compute the WMD distance\nbetween two documents using ``wmdistance``.\n\n## WMD Basics\n\nWMD enables us to assess the \"distance\" between two documents in a meaningful\nway even when they have no words in common. 
It uses [word2vec](http://rare-technologies.com/word2vec-tutorial/) [4] vector embeddings of\nwords. It been shown to outperform many of the state-of-the-art methods in\nk-nearest neighbors classification [3].\n\nWMD is illustrated below for two very similar sentences (illustration taken\nfrom [Vlad Niculae's blog](http://vene.ro/blog/word-movers-distance-in-python.html)). The sentences\nhave no words in common, but by matching the relevant words, WMD is able to\naccurately measure the (dis)similarity between the two sentences. The method\nalso uses the bag-of-words representation of the documents (simply put, the\nword's frequencies in the documents), noted as $d$ in the figure below. The\nintuition behind the method is that we find the minimum \"traveling distance\"\nbetween documents, in other words the most efficient way to \"move\" the\ndistribution of document 1 to the distribution of document 2.\n\n\n" ] }, { @@ -40,7 +40,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This method was introduced in the article \"From Word Embeddings To Document\nDistances\" by Matt Kusner et al. (\\ `link to PDF\n`_\\ ). It is inspired\nby the \"Earth Mover's Distance\", and employs a solver of the \"transportation\nproblem\".\n\nIn this tutorial, we will learn how to use Gensim's WMD functionality, which\nconsists of the ``wmdistance`` method for distance computation, and the\n``WmdSimilarity`` class for corpus based similarity queries.\n\n.. Important::\n If you use Gensim's WMD functionality, please consider citing [1], [2] and [3].\n\nComputing the Word Mover's Distance\n-----------------------------------\n\nTo use WMD, you need some existing word embeddings.\nYou could train your own Word2Vec model, but that is beyond the scope of this tutorial\n(check out `sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested).\nFor this tutorial, we'll be using an existing Word2Vec model.\n\nLet's take some sentences to compute the distance between.\n\n\n" + "This method was introduced in the article \"From Word Embeddings To Document\nDistances\" by Matt Kusner et al. (\\ [link to PDF](http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf)\\ ). It is inspired\nby the \"Earth Mover's Distance\", and employs a solver of the \"transportation\nproblem\".\n\nIn this tutorial, we will learn how to use Gensim's WMD functionality, which\nconsists of the ``wmdistance`` method for distance computation, and the\n``WmdSimilarity`` class for corpus based similarity queries.\n\n.. Important::\n If you use Gensim's WMD functionality, please consider citing [1] and [2].\n\n## Computing the Word Mover's Distance\n\nTo use WMD, you need some existing word embeddings.\nYou could train your own Word2Vec model, but that is beyond the scope of this tutorial\n(check out `sphx_glr_auto_examples_tutorials_run_word2vec.py` if you're interested).\nFor this tutorial, we'll be using an existing Word2Vec model.\n\nLet's take some sentences to compute the distance between.\n\n\n" ] }, { @@ -130,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "References\n----------\n\n1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008.\n2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009.\n3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015.\n4. Tom\u00e1\u0161 Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013.\n\n\n" + "## References\n\n1. R\u00e9mi Flamary et al. 
*POT: Python Optimal Transport*, 2021.\n2. Matt Kusner et al. *From Embeddings To Document Distances*, 2015.\n3. Tom\u00e1\u0161 Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013.\n\n\n" ] } ], @@ -150,7 +150,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/tutorials/run_wmd.py b/docs/src/auto_examples/tutorials/run_wmd.py index 06e263063a..c037ef697b 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.py +++ b/docs/src/auto_examples/tutorials/run_wmd.py @@ -53,7 +53,7 @@ # ``WmdSimilarity`` class for corpus based similarity queries. # # .. Important:: -# If you use Gensim's WMD functionality, please consider citing [1], [2] and [3]. +# If you use Gensim's WMD functionality, please consider citing [1] and [2]. # # Computing the Word Mover's Distance # ----------------------------------- @@ -118,8 +118,7 @@ def preprocess(sentence): # References # ---------- # -# 1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. -# 2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. -# 3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +# 1. Rémi Flamary et al. *POT: Python Optimal Transport*, 2021. +# 2. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. +# 3. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. # diff --git a/docs/src/auto_examples/tutorials/run_wmd.py.md5 b/docs/src/auto_examples/tutorials/run_wmd.py.md5 index 382c5b9954..b6772e20fb 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.py.md5 +++ b/docs/src/auto_examples/tutorials/run_wmd.py.md5 @@ -1 +1 @@ -45521a352637a0f53e62f3e19e61fc07 \ No newline at end of file +a087a5b43fbba9a3e71c2384ddc264af \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_wmd.rst b/docs/src/auto_examples/tutorials/run_wmd.rst index cc62e120dd..6a04bcc412 100644 --- a/docs/src/auto_examples/tutorials/run_wmd.rst +++ b/docs/src/auto_examples/tutorials/run_wmd.rst @@ -1,7 +1,18 @@ -.. note:: - :class: sphx-glr-download-link-note - Click :ref:`here ` to download the full example code +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "auto_examples/tutorials/run_wmd.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + Click :ref:`here ` + to download the full example code + .. rst-class:: sphx-glr-example-title .. _sphx_glr_auto_examples_tutorials_run_wmd.py: @@ -12,6 +23,8 @@ Word Mover's Distance Demonstrates using Gensim's implemenation of the WMD. +.. GENERATED FROM PYTHON SOURCE LINES 10-35 + Word Mover's Distance (WMD) is a promising new tool in machine learning that allows us to submit a query and return the most relevant documents. This tutorial introduces WMD and shows how you can compute the WMD distance @@ -38,6 +51,7 @@ between documents, in other words the most efficient way to "move" the distribution of document 1 to the distribution of document 2. +.. GENERATED FROM PYTHON SOURCE LINES 35-44 .. code-block:: default @@ -53,11 +67,16 @@ distribution of document 1 to the distribution of document 2. -.. 
image:: /auto_examples/tutorials/images/sphx_glr_run_wmd_001.png - :class: sphx-glr-single-img +.. image-sg:: /auto_examples/tutorials/images/sphx_glr_run_wmd_001.png + :alt: run wmd + :srcset: /auto_examples/tutorials/images/sphx_glr_run_wmd_001.png + :class: sphx-glr-single-img + + +.. GENERATED FROM PYTHON SOURCE LINES 45-68 This method was introduced in the article "From Word Embeddings To Document Distances" by Matt Kusner et al. (\ `link to PDF @@ -70,7 +89,7 @@ consists of the ``wmdistance`` method for distance computation, and the ``WmdSimilarity`` class for corpus based similarity queries. .. Important:: - If you use Gensim's WMD functionality, please consider citing [1], [2] and [3]. + If you use Gensim's WMD functionality, please consider citing [1] and [2]. Computing the Word Mover's Distance ----------------------------------- @@ -83,6 +102,7 @@ For this tutorial, we'll be using an existing Word2Vec model. Let's take some sentences to compute the distance between. +.. GENERATED FROM PYTHON SOURCE LINES 68-76 .. code-block:: default @@ -100,11 +120,15 @@ Let's take some sentences to compute the distance between. + +.. GENERATED FROM PYTHON SOURCE LINES 77-81 + These sentences have very similar content, and as such the WMD should be low. Before we compute the WMD, we want to remove stopwords ("the", "to", etc.), as these do not contribute a lot to the information in the sentences. +.. GENERATED FROM PYTHON SOURCE LINES 81-94 .. code-block:: default @@ -127,45 +151,16 @@ as these do not contribute a lot to the information in the sentences. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - dtype=np.int): - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - method='lar', copy_X=True, eps=np.finfo(np.float).eps, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - method='lar', copy_X=True, eps=np.finfo(np.float).eps, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. 
To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, positive=False): - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, - /home/witiko/.virtualenvs/gensim4/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
- Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, positive=False): - [nltk_data] Downloading package stopwords to /home/witiko/nltk_data... + [nltk_data] Downloading package stopwords to /home/thomas/nltk_data... [nltk_data] Package stopwords is already up-to-date! + +.. GENERATED FROM PYTHON SOURCE LINES 95-101 + Now, as mentioned earlier, we will be using some downloaded pre-trained embeddings. We load these into a Gensim Word2Vec model class. @@ -173,6 +168,7 @@ embeddings. We load these into a Gensim Word2Vec model class. The embeddings we have chosen here require a lot of memory. +.. GENERATED FROM PYTHON SOURCE LINES 101-104 .. code-block:: default @@ -183,11 +179,22 @@ embeddings. We load these into a Gensim Word2Vec model class. +.. rst-class:: sphx-glr-script-out + + .. code-block:: none + 2022-10-23 11:18:41,292 : INFO : loading projection weights from /home/thomas/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz + 2022-10-23 11:19:12,793 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /home/thomas/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-10-23T11:19:12.755440', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'} + + + + +.. GENERATED FROM PYTHON SOURCE LINES 105-107 So let's compute WMD using the ``wmdistance`` method. +.. GENERATED FROM PYTHON SOURCE LINES 107-110 .. code-block:: default @@ -200,17 +207,22 @@ So let's compute WMD using the ``wmdistance`` method. .. rst-class:: sphx-glr-script-out - Out: - .. code-block:: none + 2022-10-23 11:19:12,860 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-10-23 11:19:12,861 : INFO : built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions) + 2022-10-23 11:19:12,861 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)", 'datetime': '2022-10-23T11:19:12.861331', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'created'} distance = 1.0175 + +.. GENERATED FROM PYTHON SOURCE LINES 111-113 + Let's try the same thing with two completely unrelated sentences. Notice that the distance is larger. +.. GENERATED FROM PYTHON SOURCE LINES 113-117 .. code-block:: default @@ -224,50 +236,48 @@ Let's try the same thing with two completely unrelated sentences. Notice that th .. rst-class:: sphx-glr-script-out - Out: - .. 
code-block:: none - distance = 1.3663 + 2022-10-23 11:19:15,303 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-10-23 11:19:15,304 : INFO : built Dictionary<7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...> from 2 documents (total 7 corpus positions) + 2022-10-23 11:19:15,304 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...> from 2 documents (total 7 corpus positions)", 'datetime': '2022-10-23T11:19:15.304338', 'gensim': '4.2.1.dev0', 'python': '3.10.6 (main, Aug 10 2022, 11:40:04) [GCC 11.3.0]', 'platform': 'Linux-5.19.0-76051900-generic-x86_64-with-glibc2.35', 'event': 'created'} + distance = 1.3664 + +.. GENERATED FROM PYTHON SOURCE LINES 118-125 + References ---------- -1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. -2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. -3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +1. Rémi Flamary et al. *POT: Python Optimal Transport*, 2021. +2. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. +3. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 55.983 seconds) + **Total running time of the script:** ( 0 minutes 36.418 seconds) -**Estimated memory usage:** 7537 MB +**Estimated memory usage:** 7551 MB .. _sphx_glr_download_auto_examples_tutorials_run_wmd.py: +.. only:: html -.. only :: html - - .. container:: sphx-glr-footer - :class: sphx-glr-footer-example - - - - .. container:: sphx-glr-download + .. container:: sphx-glr-footer sphx-glr-footer-example - :download:`Download Python source code: run_wmd.py ` + .. container:: sphx-glr-download sphx-glr-download-python + :download:`Download Python source code: run_wmd.py ` - .. container:: sphx-glr-download + .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download Jupyter notebook: run_wmd.ipynb ` + :download:`Download Jupyter notebook: run_wmd.ipynb ` .. 
only:: html diff --git a/docs/src/auto_examples/tutorials/run_wmd_codeobj.pickle b/docs/src/auto_examples/tutorials/run_wmd_codeobj.pickle new file mode 100644 index 0000000000..9df7517582 Binary files /dev/null and b/docs/src/auto_examples/tutorials/run_wmd_codeobj.pickle differ diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index 0dfaf2783f..eb3d2c3c8c 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,22 +5,21 @@ Computation times ================= -**04:13.971** total execution time for **auto_examples_tutorials** files: -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 04:13.971 | 664.3 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` (``run_scm.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | -+-------------------------------------------------------------------------------------+-----------+----------+ ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:16.509 | 48.4 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` 
(``run_lda.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` (``run_scm.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` (``run_wmd.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ +| :ref:`sphx_glr_auto_examples_tutorials_run_word2vec.py` (``run_word2vec.py``) | 00:00.000 | 0.0 MB | ++-------------------------------------------------------------------------------------+-----------+---------+ diff --git a/docs/src/conf.py b/docs/src/conf.py index 168d4cf58e..f6483f8544 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -61,9 +61,9 @@ # built documents. # # The short X.Y version. -version = '4.2.0' +version = '4.3' # The full version, including alpha/beta/rc tags. -release = '4.2.0' +release = '4.3.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/src/gallery/README.txt b/docs/src/gallery/README.txt index 80ab288c48..560c047c25 100644 --- a/docs/src/gallery/README.txt +++ b/docs/src/gallery/README.txt @@ -1,5 +1,7 @@ Documentation ============= +.. _gallery_top: + We welcome contributions to our documentation via GitHub pull requests, whether it's fixing a typo or authoring an entirely new tutorial or guide. If you're thinking about contributing documentation, please see :ref:`sphx_glr_auto_examples_howtos_run_doc.py`. diff --git a/docs/src/gallery/core/run_topics_and_transformations.py b/docs/src/gallery/core/run_topics_and_transformations.py index 605584084d..45888505e0 100644 --- a/docs/src/gallery/core/run_topics_and_transformations.py +++ b/docs/src/gallery/core/run_topics_and_transformations.py @@ -188,6 +188,20 @@ # # model = models.TfidfModel(corpus, normalize=True) # +# * `Okapi Best Matching, Okapi BM25 `_ +# expects a bag-of-words (integer values) training corpus during initialization. +# During transformation, it will take a vector and return another vector of the +# same dimensionality, except that features which were rare in the training corpus +# will have their value increased. It therefore converts integer-valued +# vectors into real-valued ones, while leaving the number of dimensions intact. +# +# Okapi BM25 is the standard ranking function used by search engines to estimate +# the relevance of documents to a given search query. +# +# .. sourcecode:: pycon +# +# model = models.OkapiBM25Model(corpus) +# # * `Latent Semantic Indexing, LSI (or sometimes LSA) `_ # transforms documents from either bag-of-words or (preferrably) TfIdf-weighted space into # a latent space of a lower dimensionality. For the toy corpus above we used only diff --git a/docs/src/gallery/tutorials/run_doc2vec_lee.py b/docs/src/gallery/tutorials/run_doc2vec_lee.py index 7012d38f66..18f4ee7b16 100644 --- a/docs/src/gallery/tutorials/run_doc2vec_lee.py +++ b/docs/src/gallery/tutorials/run_doc2vec_lee.py @@ -215,9 +215,15 @@ def read_corpus(fname, tokens_only=False): ############################################################################### # Next, train the model on the corpus. -# If optimized Gensim (with BLAS library) is being used, this should take no more than 3 seconds. 
-# If the BLAS library is not being used, this should take no more than 2 -# minutes, so use optimized Gensim with BLAS if you value your time. +# In the usual case, where Gensim installation found a BLAS library for optimized +# bulk vector operations, this training on this tiny 300 document, ~60k word corpus +# should take just a few seconds. (More realistic datasets of tens-of-millions +# of words or more take proportionately longer.) If for some reason a BLAS library +# isn't available, training uses a fallback approach that takes 60x-120x longer, +# so even this tiny training will take minutes rather than seconds. (And, in that +# case, you should also notice a warning in the logging letting you know there's +# something worth fixing.) So, be sure your installation uses the BLAS-optimized +# Gensim if you value your time. # model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs) diff --git a/docs/src/gallery/tutorials/run_fasttext.py b/docs/src/gallery/tutorials/run_fasttext.py index 5a03b5d35e..ac3a1bc4c3 100644 --- a/docs/src/gallery/tutorials/run_fasttext.py +++ b/docs/src/gallery/tutorials/run_fasttext.py @@ -248,7 +248,7 @@ # Word Movers distance # ^^^^^^^^^^^^^^^^^^^^ # -# You'll need the optional ``pyemd`` library for this section, ``pip install pyemd``. +# You'll need the optional ``POT`` library for this section, ``pip install POT``. # # Let's start with two sentences: sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() diff --git a/docs/src/gallery/tutorials/run_wmd.py b/docs/src/gallery/tutorials/run_wmd.py index 06e263063a..c037ef697b 100644 --- a/docs/src/gallery/tutorials/run_wmd.py +++ b/docs/src/gallery/tutorials/run_wmd.py @@ -53,7 +53,7 @@ # ``WmdSimilarity`` class for corpus based similarity queries. # # .. Important:: -# If you use Gensim's WMD functionality, please consider citing [1], [2] and [3]. +# If you use Gensim's WMD functionality, please consider citing [1] and [2]. # # Computing the Word Mover's Distance # ----------------------------------- @@ -118,8 +118,7 @@ def preprocess(sentence): # References # ---------- # -# 1. Ofir Pele and Michael Werman, *A linear time histogram metric for improved SIFT matching*, 2008. -# 2. Ofir Pele and Michael Werman, *Fast and robust earth mover's distances*, 2009. -# 3. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. -# 4. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. +# 1. Rémi Flamary et al. *POT: Python Optimal Transport*, 2021. +# 2. Matt Kusner et al. *From Embeddings To Document Distances*, 2015. +# 3. Tomáš Mikolov et al. *Efficient Estimation of Word Representations in Vector Space*, 2013. # diff --git a/docs/src/sphinx_rtd_theme/notification.html b/docs/src/sphinx_rtd_theme/notification.html index 4b0d7922e1..dd648b861f 100644 --- a/docs/src/sphinx_rtd_theme/notification.html +++ b/docs/src/sphinx_rtd_theme/notification.html @@ -1,6 +1,3 @@
- You're viewing documentation for Gensim 4.0.0. For Gensim 3.8.3, please visit the old Gensim 3.8.3 documentation and Migration Guide. - + Gensim relies on your donations for sustenance. If you like Gensim, please consider donating.
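Several hunks above swap the optional Word Mover's Distance backend from ``pyemd`` to ``POT`` without touching the public API. As a minimal sketch of the calling convention exercised by the regenerated ``run_wmd`` tutorial (assumptions: ``pip install POT``, network access for the downloader, and the ``word2vec-google-news-300`` model named in the tutorial log above; the second sentence mirrors the tutorial's companion example):

.. sourcecode:: pycon

    >>> import gensim.downloader as api
    >>>
    >>> vectors = api.load('word2vec-google-news-300')  # KeyedVectors, as loaded in the tutorial output above
    >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
    >>> sentence_president = 'The president greets the press in Chicago'.lower().split()
    >>> distance = vectors.wmdistance(sentence_obama, sentence_president)  # POT solves the transport problem
    >>> print(f'distance = {distance:.4f}')  # lower means the documents are closer

The point of the sketch is that only the optional solver dependency changes; existing ``wmdistance`` calls keep working once ``POT`` is installed.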
diff --git a/docs/src/support.rst b/docs/src/support.rst index a28f1e0003..514b75b9b1 100644 --- a/docs/src/support.rst +++ b/docs/src/support.rst @@ -11,7 +11,7 @@ Open source support The main communication channel is the free `Gensim mailing list `_. -This is the preferred way to ask for help, report problems and share insights with the community. Newbie questions are perfectly fine, as long as you've read the :ref:`tutorials ` and `FAQ `_. +This is the preferred way to ask for help, report problems and share insights with the community. Newbie questions are perfectly fine, as long as you've read the :ref:`tutorials ` and `FAQ `_. FAQ and some useful snippets of code are maintained on GitHub: https://github.com/RARE-Technologies/gensim/wiki/Recipes-&-FAQ. diff --git a/gensim/__init__.py b/gensim/__init__.py index b5f915a3ed..e7c59b6bd6 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,7 +4,7 @@ """ -__version__ = '4.2.0' +__version__ = '4.3.0' import logging diff --git a/gensim/_matutils.pyx b/gensim/_matutils.pyx index 6c79020832..0162202224 100644 --- a/gensim/_matutils.pyx +++ b/gensim/_matutils.pyx @@ -1,6 +1,7 @@ #!/usr/bin/env cython # coding: utf-8 # cython: embedsignature=True +# cython: language_level=3 from __future__ import division cimport cython diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx index 3c32797de8..16edc070c8 100644 --- a/gensim/corpora/_mmreader.pyx +++ b/gensim/corpora/_mmreader.pyx @@ -1,5 +1,6 @@ # Copyright (C) 2018 Radim Rehurek # cython: embedsignature=True +# cython: language_level=3 """Reader for corpus in the Matrix Market format.""" import logging diff --git a/gensim/matutils.py b/gensim/matutils.py index 4d4064acc0..fdd1a6b592 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -17,9 +17,8 @@ import numpy as np import scipy.sparse from scipy.stats import entropy -import scipy.linalg +from scipy.linalg import get_blas_funcs, triu from scipy.linalg.lapack import get_lapack_funcs -from scipy.linalg.special_matrices import triu from scipy.special import psi # gamma function utils @@ -42,7 +41,7 @@ def blas(name, ndarray): BLAS function for the needed operation on the given data type. """ - return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] + return get_blas_funcs((name,), (ndarray,))[0] def argsort(x, topn=None, reverse=False): diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index ac08d1fdb4..d9a28ead34 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -9,6 +9,7 @@ from .ldamodel import LdaModel # noqa:F401 from .lsimodel import LsiModel # noqa:F401 from .tfidfmodel import TfidfModel # noqa:F401 +from .bm25model import OkapiBM25Model, LuceneBM25Model, AtireBM25Model # noqa:F401 from .rpmodel import RpModel # noqa:F401 from .logentropy_model import LogEntropyModel # noqa:F401 from .word2vec import Word2Vec, FAST_VERSION # noqa:F401 diff --git a/gensim/models/bm25model.py b/gensim/models/bm25model.py new file mode 100644 index 0000000000..265afb3af9 --- /dev/null +++ b/gensim/models/bm25model.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""This module implements functionality related to the `Okapi Best Matching +`_ class of bag-of-words vector space models. + +Robertson and Zaragoza [1]_ describe the original algorithm and its modifications. + +.. [1] Robertson S., Zaragoza H. (2015). `The Probabilistic Relevance Framework: BM25 and + Beyond, `_. 
+
+"""
+
+from abc import ABCMeta, abstractmethod
+from collections import defaultdict
+import logging
+import math
+
+from gensim import interfaces, utils
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+
+class BM25ABC(interfaces.TransformationABC, metaclass=ABCMeta):
+    """Objects of this abstract class realize the transformation of a word-document co-occurrence
+    matrix (int) into a BM25 matrix (positive floats). Concrete subclasses of this abstract class
+    implement different BM25 scoring functions.
+
+    """
+    def __init__(self, corpus=None, dictionary=None):
+        r"""Pre-compute the average length of a document and inverse term document frequencies,
+        which will be used to weight term frequencies for the documents.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int) or None, optional
+            An input corpus, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `dictionary` will be used to compute
+            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        dictionary : :class:`~gensim.corpora.Dictionary`
+            An input dictionary, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `corpus` will be used to compute the
+            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+
+        Attributes
+        ----------
+        avgdl : float
+            The average length of a document.
+        idfs : dict of (int, float)
+            A mapping from term ids to inverse term document frequencies.
+
+        """
+        self.avgdl, self.idfs = None, None
+        if dictionary:
+            if corpus:
+                logger.warning("constructor received both corpus and dictionary; ignoring the corpus")
+            num_tokens = sum(dictionary.cfs.values())
+            self.avgdl = num_tokens / dictionary.num_docs
+            self.idfs = self.precompute_idfs(dictionary.dfs, dictionary.num_docs)
+        elif corpus:
+            dfs = defaultdict(lambda: 0)
+            num_tokens = 0
+            num_docs = 0
+            for bow in corpus:
+                num_tokens += len(bow)
+                for term_id in set(term_id for term_id, _ in bow):
+                    dfs[term_id] += 1
+                num_docs += 1
+            self.avgdl = num_tokens / num_docs
+            self.idfs = self.precompute_idfs(dfs, num_docs)
+        else:
+            pass
+
+    @abstractmethod
+    def precompute_idfs(self, dfs, num_docs):
+        """Precompute inverse term document frequencies, which will be used to weight term frequencies
+        for the documents.
+
+        Parameters
+        ----------
+        dfs : dict of (int, int)
+            A mapping from term ids to term document frequencies.
+        num_docs : int
+            The total number of documents in the training corpus.
+
+        Returns
+        -------
+        idfs : dict of (int, float)
+            A mapping from term ids to inverse term document frequencies.
+
+        """
+        pass
+
+    @abstractmethod
+    def get_term_weights(self, num_tokens, term_frequencies, idfs):
+        """Compute vector space weights for a set of terms in a document.
+
+        Parameters
+        ----------
+        num_tokens : int
+            The number of tokens in the document.
+        term_frequencies : ndarray
+            1D array of term frequencies.
+        idfs : ndarray
+            1D array of inverse term document frequencies.
+
+        Returns
+        -------
+        term_weights : ndarray
+            1D array of vector space weights.
+
+        """
+        pass
+
+    def __getitem__(self, bow):
+        is_corpus, bow = utils.is_corpus(bow)
+        if is_corpus:
+            return self._apply(bow)
+
+        num_tokens = sum(freq for term_id, freq in bow)
+
+        term_ids, term_frequencies, idfs = [], [], []
+        for term_id, term_frequency in bow:
+            term_ids.append(term_id)
+            term_frequencies.append(term_frequency)
+            idfs.append(self.idfs.get(term_id) or 0.0)
+        term_frequencies, idfs = np.array(term_frequencies), np.array(idfs)
+
+        term_weights = self.get_term_weights(num_tokens, term_frequencies, idfs)
+
+        vector = [
+            (term_id, float(weight))
+            for term_id, weight
+            in zip(term_ids, term_weights)
+        ]
+        return vector
+
+
+class OkapiBM25Model(BM25ABC):
+    """The original Okapi BM25 scoring function of Robertson et al. [2]_.
+
+    Examples
+    --------
+    .. sourcecode:: pycon
+
+        >>> from gensim.corpora import Dictionary
+        >>> from gensim.models import OkapiBM25Model
+        >>> from gensim.test.utils import common_texts
+        >>>
+        >>> dictionary = Dictionary(common_texts)  # fit dictionary
+        >>> model = OkapiBM25Model(dictionary=dictionary)  # fit model
+        >>>
+        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
+        >>> vector = model[corpus[0]]  # apply model to the first corpus document
+
+    References
+    ----------
+    .. [2] Robertson S. E., Walker S., Jones S., Hancock-Beaulieu M. M., Gatford M. (1995).
+       `Okapi at TREC-3 `_.
+       *NIST Special Publication 500-226*.
+
+    """
+    def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75, epsilon=0.25):
+        r"""Pre-compute the average length of a document and inverse term document frequencies,
+        which will be used to weight term frequencies for the documents.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int) or None, optional
+            An input corpus, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `dictionary` will be used to compute
+            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        dictionary : :class:`~gensim.corpora.Dictionary`
+            An input dictionary, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `corpus` will be used to compute the
+            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+        epsilon : float
+            A positive tuning parameter that lower-bounds an inverse document frequency.
+            Defaults to 0.25.
+
+        Attributes
+        ----------
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+        epsilon : float
+            A positive tuning parameter that lower-bounds an inverse document frequency.
+            Defaults to 0.25.
+
+        References
+        ----------
+        .. [3] Singhal, A. (2001). `Modern information retrieval: A brief overview
+           `_. *IEEE Data Eng. Bull.*, 24(4), 35–43.
+
+        """
+        self.k1, self.b, self.epsilon = k1, b, epsilon
+        super().__init__(corpus, dictionary)
+
+    def precompute_idfs(self, dfs, num_docs):
+        idf_sum = 0
+        idfs = dict()
+        negative_idfs = []
+        for term_id, freq in dfs.items():
+            idf = math.log(num_docs - freq + 0.5) - math.log(freq + 0.5)
+            idfs[term_id] = idf
+            idf_sum += idf
+            if idf < 0:
+                negative_idfs.append(term_id)
+        average_idf = idf_sum / len(idfs)
+
+        eps = self.epsilon * average_idf
+        for term_id in negative_idfs:
+            idfs[term_id] = eps
+
+        return idfs
+
+    def get_term_weights(self, num_tokens, term_frequencies, idfs):
+        term_weights = idfs * (term_frequencies * (self.k1 + 1)
+                               / (term_frequencies + self.k1 * (1 - self.b + self.b
+                                                                * num_tokens / self.avgdl)))
+        return term_weights
+
+
+class LuceneBM25Model(BM25ABC):
+    """The scoring function of Apache Lucene 8 [4]_.
+
+    Examples
+    --------
+    .. sourcecode:: pycon
+
+        >>> from gensim.corpora import Dictionary
+        >>> from gensim.models import LuceneBM25Model
+        >>> from gensim.test.utils import common_texts
+        >>>
+        >>> dictionary = Dictionary(common_texts)  # fit dictionary
+        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
+        >>>
+        >>> model = LuceneBM25Model(dictionary=dictionary)  # fit model
+        >>> vector = model[corpus[0]]  # apply model to the first corpus document
+
+    References
+    ----------
+    .. [4] Kamphuis, C., de Vries, A. P., Boytsov, L., Lin, J. (2020). Which
+       BM25 Do You Mean? `A Large-Scale Reproducibility Study of Scoring Variants
+       `_. In: Advances in Information Retrieval.
+       28–34.
+
+    """
+    def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75):
+        r"""Pre-compute the average length of a document and inverse term document frequencies,
+        which will be used to weight term frequencies for the documents.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int) or None, optional
+            An input corpus, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `dictionary` will be used to compute
+            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        dictionary : :class:`~gensim.corpora.Dictionary`
+            An input dictionary, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `corpus` will be used to compute the
+            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+
+        Attributes
+        ----------
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+
+        """
+        self.k1, self.b = k1, b
+        super().__init__(corpus, dictionary)
+
+    def precompute_idfs(self, dfs, num_docs):
+        idfs = dict()
+        for term_id, freq in dfs.items():
+            idf = math.log(num_docs + 1.0) - math.log(freq + 0.5)
+            idfs[term_id] = idf
+        return idfs
+
+    def get_term_weights(self, num_tokens, term_frequencies, idfs):
+        term_weights = idfs * (term_frequencies
+                               / (term_frequencies + self.k1 * (1 - self.b + self.b
+                                                                * num_tokens / self.avgdl)))
+        return term_weights
+
+
+class AtireBM25Model(BM25ABC):
+    """The scoring function of Trotman et al. [5]_.
+
+    Examples
+    --------
+    .. sourcecode:: pycon
+
+        >>> from gensim.corpora import Dictionary
+        >>> from gensim.models import AtireBM25Model
+        >>> from gensim.test.utils import common_texts
+        >>>
+        >>> dictionary = Dictionary(common_texts)  # fit dictionary
+        >>> corpus = [dictionary.doc2bow(line) for line in common_texts]  # convert corpus to BoW format
+        >>>
+        >>> model = AtireBM25Model(dictionary=dictionary)  # fit model
+        >>> vector = model[corpus[0]]  # apply model to the first corpus document
+
+    References
+    ----------
+    .. [5] Trotman, A., Jia X., Crane M., `Towards an Efficient and Effective Search Engine
+       `_,
+       In: SIGIR 2012 Workshop on Open Source Information Retrieval. 40–47.
+
+    """
+    def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75):
+        r"""Pre-compute the average length of a document and inverse term document frequencies,
+        which will be used to weight term frequencies for the documents.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int) or None, optional
+            An input corpus, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `dictionary` will be used to compute
+            the statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        dictionary : :class:`~gensim.corpora.Dictionary`
+            An input dictionary, which will be used to compute the average length of a document and
+            inverse term document frequencies. If None, then `corpus` will be used to compute the
+            statistics. If both `corpus` and `dictionary` are None, the statistics will be left
+            uninitialized. Default is None.
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+
+        Attributes
+        ----------
+        k1 : float
+            A positive tuning parameter that determines the impact of the term frequency on its BM25
+            weight. Singhal [3]_ suggests to set `k1` between 1.0 and 2.0. Default is 1.5.
+        b : float
+            A tuning parameter between 0.0 and 1.0 that determines the document length
+            normalization: 1.0 corresponds to full document normalization, while 0.0 corresponds to
+            no length normalization. Singhal [3]_ suggests to set `b` to 0.75, which is the default.
+ + """ + self.k1, self.b = k1, b + super().__init__(corpus, dictionary) + + def precompute_idfs(self, dfs, num_docs): + idfs = dict() + for term_id, freq in dfs.items(): + idf = math.log(num_docs) - math.log(freq) + idfs[term_id] = idf + return idfs + + def get_term_weights(self, num_tokens, term_frequencies, idfs): + term_weights = idfs * (term_frequencies * (self.k1 + 1) + / (term_frequencies + self.k1 * (1 - self.b + self.b + * num_tokens / self.avgdl))) + return term_weights diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py index 42f250cb91..c5560441af 100644 --- a/gensim/models/callbacks.py +++ b/gensim/models/callbacks.py @@ -234,9 +234,7 @@ def get_value(self, **kwargs): Key word arguments to override the object's internal attributes. One of the following parameters are expected: - * `model` - pre-trained topic model of type :class:`~gensim.models.ldamodel.LdaModel`, or one - of its wrappers, such as :class:`~gensim.models.wrappers.ldamallet.LdaMallet` or - :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`. + * `model` - pre-trained topic model of type :class:`~gensim.models.ldamodel.LdaModel`. * `topics` - list of tokenized topics. Returns @@ -290,10 +288,8 @@ def get_value(self, **kwargs): ---------- **kwargs Key word arguments to override the object's internal attributes. - A trained topic model is expected using the 'model' key. This can be of type - :class:`~gensim.models.ldamodel.LdaModel`, or one of its wrappers, such as - :class:`~gensim.models.wrappers.ldamallet.LdaMallet` or - :class:`~gensim.models.wrapper.ldavowpalwabbit.LdaVowpalWabbit`. + A trained topic model is expected using the 'model' key. + This must be of type :class:`~gensim.models.ldamodel.LdaModel`. Returns ------- @@ -354,8 +350,8 @@ def get_value(self, **kwargs): ---------- **kwargs Key word arguments to override the object's internal attributes. - Two models of type :class:`~gensim.models.ldamodelLdaModel` or its wrappers are expected using the keys - `model` and `other_model`. + Two models of type :class:`~gensim.models.ldamodelLdaModel` + are expected using the keys `model` and `other_model`. Returns ------- @@ -424,8 +420,8 @@ def get_value(self, **kwargs): ---------- **kwargs Key word arguments to override the object's internal attributes. - Two models of type :class:`~gensim.models.ldamodel.LdaModel` or its wrappers are expected using the keys - `model` and `other_model`. + Two models of type :class:`~gensim.models.ldamodel.LdaModel` + are expected using the keys `model` and `other_model`. Returns ------- diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index b3c89640a7..d6df976153 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -132,8 +132,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= model : :class:`~gensim.models.basemodel.BaseTopicModel`, optional Pre-trained topic model, should be provided if topics is not provided. Currently supports :class:`~gensim.models.ldamodel.LdaModel`, - :class:`~gensim.models.ldamulticore.LdaMulticore`, :class:`~gensim.models.wrappers.ldamallet.LdaMallet` and - :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`. + :class:`~gensim.models.ldamulticore.LdaMulticore`. Use `topics` parameter to plug in an as yet unsupported model. topics : list of list of str, optional List of tokenized topics, if this is preferred over model - dictionary should be provided. 
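The three classes added in ``bm25model.py`` above share the ``BM25ABC`` interface, so they can be swapped freely. A minimal sketch, directly mirroring the doctests in the new module (the only assumed data is ``common_texts``, the toy corpus shipped with gensim's test utilities):

.. sourcecode:: pycon

    >>> from gensim.corpora import Dictionary
    >>> from gensim.models import AtireBM25Model, LuceneBM25Model, OkapiBM25Model
    >>> from gensim.test.utils import common_texts
    >>>
    >>> dictionary = Dictionary(common_texts)                           # fit the dictionary on the toy corpus
    >>> bow_corpus = [dictionary.doc2bow(doc) for doc in common_texts]  # integer term counts
    >>>
    >>> # Every variant is fitted the same way and applied with model[bow],
    >>> # turning term counts into BM25 term weights.
    >>> for model_class in (OkapiBM25Model, LuceneBM25Model, AtireBM25Model):
    ...     model = model_class(dictionary=dictionary)
    ...     print(model_class.__name__, model[bow_corpus[0]])

Because the result is an ordinary sparse gensim vector, it can feed any downstream consumer of bag-of-words-shaped corpora, in the same way as the ``TfidfModel`` output described in ``run_topics_and_transformations.py`` above.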
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 20a739f64a..d76be81f9b 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1137,7 +1137,8 @@ def __iter__(self): class TaggedLineDocument: def __init__(self, source): - """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. + """Iterate over a file that contains documents: + one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed automatically from the document line number (each document gets a unique integer tag). diff --git a/gensim/models/doc2vec_corpusfile.pyx b/gensim/models/doc2vec_corpusfile.pyx index 9216d13bd4..50a07fc3ab 100644 --- a/gensim/models/doc2vec_corpusfile.pyx +++ b/gensim/models/doc2vec_corpusfile.pyx @@ -1,4 +1,5 @@ #!/usr/bin/env cython +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True diff --git a/gensim/models/doc2vec_inner.pxd b/gensim/models/doc2vec_inner.pxd index 77da86f449..41635b47a0 100644 --- a/gensim/models/doc2vec_inner.pxd +++ b/gensim/models/doc2vec_inner.pxd @@ -1,5 +1,6 @@ #!/usr/bin/env cython # distutils: language = c++ +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -15,7 +16,7 @@ import numpy as np cimport numpy as np -from word2vec_inner cimport REAL_t +from gensim.models.word2vec_inner cimport REAL_t DEF MAX_DOCUMENT_LEN = 10000 diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 1657c59787..804a3ac28d 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -1,4 +1,5 @@ #!/usr/bin/env cython +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -24,7 +25,7 @@ except ImportError: # in scipy > 0.15, fblas function has been removed import scipy.linalg.blas as fblas -from word2vec_inner cimport bisect_left, random_int32, sscal, REAL_t, EXP_TABLE, our_dot, our_saxpy +from gensim.models.word2vec_inner cimport bisect_left, random_int32, sscal, REAL_t, EXP_TABLE, our_dot, our_saxpy DEF MAX_DOCUMENT_LEN = 10000 diff --git a/gensim/models/ensemblelda.py b/gensim/models/ensemblelda.py index 39d7e06620..16b5c1c1bc 100644 --- a/gensim/models/ensemblelda.py +++ b/gensim/models/ensemblelda.py @@ -91,7 +91,7 @@ Citation -------- BRIGL, Tobias, 2019, Extracting Reliable Topics using Ensemble Latent Dirichlet Allocation [Bachelor Thesis]. -Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Available from: +Technische Hochschule Ingolstadt. Munich: Data Reply GmbH. Supervised by Alex Loosley. 
Available from: https://www.sezanzeb.de/machine_learning/ensemble_LDA/ """ diff --git a/gensim/models/fasttext_corpusfile.pyx b/gensim/models/fasttext_corpusfile.pyx index 5d275b42b6..2b5344e2d5 100644 --- a/gensim/models/fasttext_corpusfile.pyx +++ b/gensim/models/fasttext_corpusfile.pyx @@ -1,5 +1,6 @@ #!/usr/bin/env cython # distutils: language = c++ +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True diff --git a/gensim/models/fasttext_inner.pxd b/gensim/models/fasttext_inner.pxd index 31a1b1d35f..af7a531116 100644 --- a/gensim/models/fasttext_inner.pxd +++ b/gensim/models/fasttext_inner.pxd @@ -1,4 +1,5 @@ #!/usr/bin/env cython +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -13,7 +14,7 @@ import numpy as np cimport numpy as np -from word2vec_inner cimport REAL_t +from gensim.models.word2vec_inner cimport REAL_t DEF MAX_SENTENCE_LEN = 10000 diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx index e27bd62feb..6e246b3579 100644 --- a/gensim/models/fasttext_inner.pyx +++ b/gensim/models/fasttext_inner.pyx @@ -1,4 +1,5 @@ #!/usr/bin/env cython +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -55,7 +56,7 @@ from libc.string cimport memset # # The versions are as chosen in word2vec_inner.pyx, and aliased to `our_` functions -from word2vec_inner cimport bisect_left, random_int32, scopy, sscal, \ +from gensim.models.word2vec_inner cimport bisect_left, random_int32, scopy, sscal, \ REAL_t, our_dot, our_saxpy DEF MAX_SENTENCE_LEN = 10000 diff --git a/gensim/models/flsamodel.py b/gensim/models/flsamodel.py new file mode 100644 index 0000000000..9ed815abe1 --- /dev/null +++ b/gensim/models/flsamodel.py @@ -0,0 +1,1696 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Oct 27 11:04:27 2022 + +@author: 20200016 +""" + +import math +from collections import Counter +import warnings +import pickle +import itertools +import numpy as np +from scipy.sparse.linalg import svds +from scipy.sparse import dok_matrix +from pyfume import Clustering +import gensim.corpora as corpora +from gensim.models.coherencemodel import CoherenceModel +from gensim.models import Word2Vec + + +class FlsaModel(): + """ + Class to initialize and train fuzzy topic models with methods similar + to Gensim's LdaModel' + + Parameters + ---------- + corpus : The input corpus. + either: list of list of str. + or: list of list of tuples (int, int) (bow). + + num_topics: int + The number of topics to be trained. + + algorithm: str ['flsa', 'flsa-w', 'flsa-e'] + The algorithm to train. + + id2word: gensim.corpora.dictionary.Dictionary + Object to map id's to words + (only used when the corpus is passed into the object as a bow). + + word_weighting: str ['normal', 'idf', 'probidf', 'entropy'] + Global term weighting mechanism. + + cluster_method: str ['fcm', 'gk', 'fst-pso'] + Fuzzy clustering method. + + svd_factors: int + The number of singular values to use. 
+ """ + def __init__( + self, + corpus, + num_topics, + algorithm, + num_words=20, + word_weighting='normal', + cluster_method='fcm', + svd_factors=2, + id2word=None, + min_count=None, + window=None, + vector_size=None, + workers=None, + ): + self.corpus = self._set_corpus(corpus, id2word) + self.num_topics = num_topics + self.algorithm = algorithm + self.num_topics = num_topics + self.num_words = num_words + self.word_weighting = word_weighting + self.cluster_method = cluster_method + self.svd_factors = svd_factors + self.min_count = min_count + self.window = window + self.vector_size = vector_size + self.workers = workers + self._check_variables() + self._vocabulary, self._vocabulary_size = self._create_vocabulary(self.corpus) + self._word_to_index, self._index_to_word = self._create_index_dicts(self._vocabulary) + self._sum_words = self._create_sum_words(self.corpus) + self._prob_word_i = None + self._prob_document_j = None + self._prob_topic_k = None + self._prob_word_given_topic = None + self._prob_word_given_document = None + self.coherence_score = None + self.diversity_score = None + self.pwgt, self.ptgd = self._get_matrices() + + def _set_corpus( + self, + corpus, + id2word, + ): + """ + Method that sets the corpus to FuzzyTM's required input format. + If a list of list of str is passed into the method for corpus, then + it returns the same corpus. If a bow (list of list of tuples) is passed + into the class, it transforms this into a list of list of str. + + Parameters + ---------- + corpus : either: list of list of str (tokens). or: list of list of tuples (int, int). + The input corpus. + id2word: gensim.corpora.dictionary.Dictionary + Object to map id's to words + (only used when the corpus is passed into the object as a bow) + + Returns + ------- + list of list of str + The corpus in FuzzyTM's required input format. + """ + if self._check_bow(corpus): + if not isinstance(id2word, corpora.dictionary.Dictionary): + raise ValueError("Please pass 'id2word' when using a bow for 'corpus'.") + return self._convert_bow(corpus, id2word) + return corpus + + @staticmethod + def _check_bow( + corpus, + ): + """ + Method to check if the input format has the bow format. + + Parameters + ---------- + corpus : either: list of list of str (tokens). or: list of list of tuples (int, int). + The input corpus. + + Returns + ------- + bool + True if bow format + """ + if not isinstance(corpus, list): + return False + for doc in corpus: + if not isinstance(doc, list): + return False + for tup in doc: + if not isinstance(tup, tuple): + return False + if not isinstance(tup[0], int) or not isinstance(tup[1], int): + return False + return True + + @staticmethod + def _convert_bow( + corpus, + id2word, + ): + """ + Method to convert the bow format into a list of list of str. + + Parameters + ---------- + corpus : The input corpus. + either: list of list of str (tokens). + or: list of list of tuples (int, int). + + id2word: gensim.corpora.dictionary.Dictionary + Object to map id's to words + + Returns + ------- + list of list of str + The corpus in FuzzyTM's required input format. + """ + id2token = {v: k for k, v in id2word.token2id.items()} + data_list = [] + for doc in corpus: + doc_list = [] + for tup in doc: + for _ in itertools.repeat(None, tup[1]): + doc_list.append(id2token[tup[0]]) + data_list.append(doc_list) + return data_list + + def _check_variables(self): + """ + Check whether the input data has the right format. 
+ + Correct format: list of list of str (tokens) + The function raises an error if the format is incorrect. + """ + for i, doc in enumerate(self.corpus): + if not isinstance(doc, list): + raise TypeError("corpus variable at index ", + str(i), + " is not a list") + if not len(doc) > 0: + raise ValueError( + "The corpus has an empty list at index ", + str(i), + " and should contain at least one str value") + for j, word in enumerate(doc): + if not isinstance(word, str): + raise TypeError(f"Word {j} of document {i} is not a str") + if not isinstance(self.num_topics, int) or self.num_topics < 1: + raise ValueError("Please use a positive int for num_topics") + if not isinstance(self.num_words, int) or self.num_words < 1: + raise ValueError("Please use a positive int for num_words") + if self.algorithm in [ + "flsa", + "flsa-w", + ] and self.word_weighting not in [ + "entropy", + "idf", + "normal", + "probidf", + ]: + warning = ["Invalid word weighting method", + "Please choose between:", + "'entropy', 'idf', 'normal' and'probidf'", + ] + raise ValueError(' '.join(warning)) + if self.cluster_method not in [ + "fcm", + "fst-pso", + "gk", + ]: + raise ValueError( + "Invalid 'cluster_method. Please choose: 'fcm', 'fst-pso' or 'gk'") + if not isinstance(self.svd_factors, int) and self.svd_factors > 0: + raise ValueError("Please use a positive int for svd_factors") + if self.algorithm not in [ + 'flsa', + 'flsa-w', + 'flsa-e', + ]: + raise ValueError('Please select a correct "algoritm"') + + @staticmethod + def _create_vocabulary(corpus): + """ + Create the vocabulary from 'corpus'. + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + + Returns + ------- + set of str + All the vocabulary words. + """ + vocabulary = set(el for lis in corpus for el in lis) + return vocabulary, len(vocabulary) + + @staticmethod + def _create_index_dicts(vocabulary): + """ + Create the dictionaries with mappings between words and indices. + + Parameters + ---------- + vocabulary : set of str + All the words in the corpus. + + Returns + ------- + dict of {str : int} + Dictionary that maps a vocabulary word to and index number. + dict of {int : str} + Dictionary that maps an index number to each vocabulary word. + """ + if not isinstance(vocabulary, set): + raise ValueError("Please use a 'set' type for 'vocabulary'.") + word_to_index = dict() + index_to_word = dict() + for i, word in enumerate(vocabulary): + word_to_index[word] = i + index_to_word[i] = word + return word_to_index, index_to_word + + @staticmethod + def _create_sum_words(corpus): + """ + Creates a Counter object that stores the count of each word in the corpus (corpus). + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + + Returns + ------- + collections.Counter {str : int} + The count of each word in the corpus. + """ + sum_words = Counter() + for document in corpus: + sum_words.update(Counter(document)) + return sum_words + + @staticmethod + def _create_sparse_local_term_weights( + corpus, + vocabulary_size, + word_to_index, + ): + """ + Creates a sparse matrix showing the frequency of each words in documents. + + (See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.dok_matrix.html) + Axes: + rows: documents (size: number of documents in corpus) + columns: words (size: vocabulary length) + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. 
+ vocabulary_size : int + Number of unique words in the corpus. + word_to_index: dict {str : int} + Maps each unique vocabulary word to a unique index number. + + Returns + ------- + scipy.sparse.dok_matrix + sparse matrix representation of the local term weights. + """ + sparse_local_term_weights = dok_matrix( + (len(corpus), + vocabulary_size), + dtype=np.float32, + ) + for document_index, document in enumerate(corpus): + document_counter = Counter(document) + for word in document_counter.keys(): + sparse_local_term_weights[ + document_index, word_to_index[word], + ] = document_counter[word] + return sparse_local_term_weights + + def _create_sparse_global_term_weights( + self, + corpus, + word_weighting, + vocabulary_size=None, + sparse_local_term_weights=None, + index_to_word=None, + word_to_index=None, + sum_words=None, + ): + """ + Apply a word_weighting method on the sparse_local_term_weights + to create sparse_global_term_weights. + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + word_weighting : str + Indicates the method used for word_weighting. Choose from: + - entropy + - normal + - idf + - probidf + vocabulary_size : int + Number of unique words in the corpus. + sparse_local_term_weights : scipy.sparse.dok_matrix + A sparse matrix showing the frequency of each words in documents. + word_to_index : dict {str : int} + Maps each unique vocabulary word to a unique index number. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. + sum_words : collections.Counter {str : int} + The count of each word in the corpus. + + Returns + ------- + scipy.sparse.dok_matrix + sparse matrix representation of the global term weights. + """ + num_documents = len(corpus) + if word_weighting in ['entropy', 'normal']: + if sparse_local_term_weights is None: + raise ValueError("Please feed the algorithm 'sparse_local_term_weights'") + if word_weighting in ['entropy']: + if index_to_word is None: + raise ValueError("Please feed the algorithm 'index_to_word'") + if sum_words is None: + raise ValueError("Please feed the algorithm 'sum_words'") + if word_weighting in ['entropy', 'idf', 'probidf']: + if vocabulary_size is None: + raise ValueError("Please feed the algorithm 'vocabulary_size'") + if word_weighting in ['idf', 'probidf']: + if word_to_index is None: + raise ValueError("Please feed the algorithm 'word_to_index'") + if word_weighting == 'entropy': + global_term_weights = self._calculate_entropy( + num_documents, + vocabulary_size, + sparse_local_term_weights, + index_to_word, sum_words, + ) + elif word_weighting == 'idf': + global_term_weights = self._calculate_idf( + num_documents, + vocabulary_size, + corpus, + word_to_index, + ) + elif word_weighting == 'normal': + global_term_weights = self._calculate_normal(sparse_local_term_weights) + elif word_weighting == 'probidf': + global_term_weights = self._calculate_probidf( + num_documents, + vocabulary_size, + corpus, + word_to_index, + ) + else: + raise ValueError('Invalid word weighting method') + return sparse_local_term_weights.multiply(global_term_weights).tocsc() + + def _calculate_entropy( + self, + num_documents, + vocabulary_size, + sparse_local_term_weights, + index_to_word, + sum_words, + ): + """ + Use the entropy word weighting method. 
+ + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + num_documents : int + The number of documents in the corpus. + vocabulary_size : int + Number of unique words in the corpus. + sparse_local_term_weights : scipy.sparse.dok_matrix + A sparse matrix showing the frequency of each words in documents. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. + sum_words : collections.Counter {str : int} + The count of each word in the corpus. + + Returns + ------- + numpy.array : float + """ + p_log_p_ij = self._create_p_log_p_ij( + num_documents, + vocabulary_size, + sparse_local_term_weights, + index_to_word, + sum_words, + ) + summed_p_log_p = p_log_p_ij.sum(0).tolist()[0] + return np.array([1 + summed_p_log_p_i / np.log2(num_documents) for summed_p_log_p_i in summed_p_log_p]) + + def _calculate_idf( + self, + num_documents, + vocabulary_size, + corpus, + word_to_index, + ): + """ + Use the idf word weightingg method. + + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + num_documents : int + The number of documents in the corpus. + vocabulary_size : int + Number of unique words in the corpus. + corpus : list of lists of str + The input file used to initialize the model. + word_to_index: dict {str : int} + Maps each unique vocabulary word to a unique index number. + + Returns + ------- + numpy.array : float + """ + binary_sparse_dtm = self._create_sparse_binary_dtm( + num_documents, + vocabulary_size, + corpus, + word_to_index, + ) + summed_words = binary_sparse_dtm.sum(0).tolist()[0] + return np.array([np.log2(num_documents / word_count) for word_count in summed_words]) + + @staticmethod + def _calculate_normal( + sparse_local_term_weights, + ): + """ + Use the normal word weightingg method. + + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + sparse_local_term_weights : scipy.sparse.dok_matrix + A sparse matrix showing the frequency of each words in documents. + + Returns + ------- + numpy.array : float + """ + squared_dtm = sparse_local_term_weights.multiply(sparse_local_term_weights) + summed_words = squared_dtm.sum(0).tolist()[0] + return np.array([1 / (math.sqrt(word_count)) for word_count in summed_words]) + + def _calculate_probidf( + self, + num_documents, + vocabulary_size, + corpus, + word_to_index, + ): + """ + Use the probidf word weightingg method. + + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + num_documents : int + The number of documents in the corpus. + vocabulary_size : int + Number of unique words in the corpus. + corpus : list of lists of str + The input file used to initialize the model. + word_to_index: dict {str : int} + Maps each unique vocabulary word to a unique index number. + + Returns + ------- + numpy.array : float + """ + binary_sparse_dtm = self._create_sparse_binary_dtm( + num_documents, + vocabulary_size, + corpus, + word_to_index, + ) + summed_binary_words_list = binary_sparse_dtm.sum(0).tolist()[0] + + return np.array([np.log2((num_documents - binary_word_count) / binary_word_count) + for binary_word_count in summed_binary_words_list]) + + @staticmethod + def _create_p_log_p_ij( + num_documents, + vocabulary_size, + sparse_local_term_weights, + index_to_word, + sum_words, + ): + """ + Create probability of word i in document j, multiplied by its base-2 logarithm. 
+ + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + num_documents : int + The number of documents in the corpus. + vocabulary_size : int + Number of unique words in the corpus. + sparse_local_term_weights : scipy.sparse.dok_matrix + A sparse matrix showing the frequency of each words in documents. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. + sum_words : collections.Counter {str : int} + The count of each word in the corpus. + + Returns + ------- + scipy.sparse.dok_matrix + """ + p_log_p_ij = dok_matrix( + (num_documents, vocabulary_size), dtype=np.float32, + ) + for j in range(num_documents): + row_counts = sparse_local_term_weights.getrow(j).toarray()[0] + word_index = row_counts.nonzero()[0] + non_zero_row_counts = row_counts[row_counts != 0] + for i, count in enumerate(non_zero_row_counts): + word = index_to_word[word_index[i]] + prob_ij = count / sum_words[word] + p_log_p_ij[j, word_index[i]] = prob_ij * np.log2(prob_ij) + return p_log_p_ij + + @staticmethod + def _create_sparse_binary_dtm( + num_documents, + vocabulary_size, + corpus, + word_to_index, + ): + """ + Create a binary sparse document-term-matrix (used for idf and probidf). + + (See: https://link.springer.com/article/10.1007/s40815-017-0327-9) + + Parameters + ---------- + num_documents : int + The number of documents in the corpus. + vocabulary_size : int + Number of unique words in the corpus. + corpus : list of lists of str + The input file used to initialize the model. + word_to_index: dict {str : int} + Maps each unique vocabulary word to a unique index number. + + Returns + ------- + scipy.sparse.dok_matrix + """ + binary_sparse_dtm = dok_matrix( + (num_documents, vocabulary_size), dtype=np.float32, + ) + for doc_index, document in enumerate(corpus): + binary_document_counter = dict.fromkeys(document, 1) + for word in set(document): + binary_sparse_dtm[doc_index, + word_to_index[word]] = binary_document_counter[word] + return binary_sparse_dtm + + @staticmethod + def _create_projected_data( + algorithm, + sparse_weighted_matrix, + svd_factors, + ): + """ + Perform singular decomposition for dimensionality reduction. + + (See: https://web.mit.edu/be.400/www/SVD/Singular_Value_Decomposition.htm) + For SVD on a sparse matrix, the sparsesvd package is used + (https://pypi.org/project/sparsesvd/) + + Parameters + ---------- + algorithm : str + Indicator for which algorithm is being trained ('flsa' or 'flsa-w'). + sparse_weighted_matrix : scipy.sparse.dok_matrix + Sparse global term matrix. + svd_factors : int + The number of singular values to include. + + Returns + ------- + numpy.array : float + """ + svd_u, _, svd_v = svds( + sparse_weighted_matrix, + svd_factors, + ) + if algorithm in ['flsa']: + return svd_u + if algorithm in ['flsa-w']: + return svd_v.T + raise ValueError('Invalid algorithm selected.', + 'Only "flsa" ans "flsa-w" are currently supported.') + + @staticmethod + def _create_partition_matrix( + data, + number_of_clusters, + method='fcm', + ): + """ + Perform clustering on the projected data. + + The pyFUME package is used for clustering: + (https://pyfume.readthedocs.io/en/latest/Clustering.html) + + Parameters + ---------- + data: numpy.array + The output from self._create_projected_data(). + number_of_clusters : int + The number of clusters (topics). + method : str + The cluster method, choose from: 'fcm', 'gk', 'fst-pso'. 
+ Returns + ------- + numpy.array : float + """ + clusterer = Clustering.Clusterer( + nr_clus=number_of_clusters, + data=data, + ) + _, partition_matrix, _ = clusterer.cluster(method=method) + return partition_matrix + + @staticmethod + def _create_prob_document_j(sparse_matrix): + """ + Get the probability of document j. + + Parameters + ---------- + sparse_matrix : scipy.sparse.dok_matrix + A sparse matrix representation of the global term weights. + Returns + ------- + numpy.array : float + (shape: number of documents x 1) + """ + # Vector with the length of num_document, + # each cell represents the sum of all weights of a document + document_sum = np.array([doc[0] for doc in sparse_matrix.sum(1).tolist()]) + # sum of all the elements in the weighted matrix + total_sum_d = sum(sparse_matrix.sum(0).tolist()[0]) + return document_sum / total_sum_d # normalized probability + + @staticmethod + def _create_prob_word_i(sparse_matrix): + """ + Get the probability of word i. + + Parameters + ---------- + sparse_matrix : scipy.sparse.dok_matrix + A sparse matrix representation of the global term weights. + + Returns + ------- + numpy.array : float + (shape: vocabulary_size x 1) + """ + word_sum = np.array(sparse_matrix.sum(0).tolist()) + # Sum of all the elements in the weighted matrix + total_sum_w = sum(sparse_matrix.sum(0).tolist()[0]) + return (word_sum / total_sum_w)[0] # normalized probability + + @staticmethod + def _create_prob_topic_k( + prob_topic_given_word_transpose, + prob_word_i, + ): + """ + Get the probability of topic k. + + Parameters + ---------- + prob_topic_given_word_transpose : numpy.array : float + The output from self._create_partition_matrix(). + prob_word_i : numpy.array : float + The output from self._create_prob_word_i(). + + Returns + ------- + numpy.array : float + (shape: 1 x number of topics) + """ + return np.matmul(prob_topic_given_word_transpose.T, prob_word_i) + + @staticmethod + def _check_passed_variables( + algorithm, + prob_topic_given_document_transpose, + prob_topic_given_word_transpose, + local_term_weights, + global_term_weights, + ): + """ + Check whether the algorithms are being fed the right attributes. + """ + if algorithm in ['flsa']: + if prob_topic_given_document_transpose is None: + raise ValueError("Please feed the method", + "'prob_topic_given_document_transpose' to run flsa") + if global_term_weights is None: + raise ValueError("Please feed the method 'global_term_weights', to run flsa") + elif algorithm in ['flsa-w']: + if prob_topic_given_word_transpose is None: + raise ValueError("Please feed the method", + "'prob_topic_given_word_transpose' to run flsa-w") + if global_term_weights is None: + raise ValueError("Please feed the method 'global_term_weights'", + " to run flsa-w") + elif algorithm in [ + 'flsa-e', + ]: + if prob_topic_given_word_transpose is None: + raise ValueError("Please feed the method", + "'prob_topic_given_word_transpose' to run model") + if local_term_weights is None: + raise ValueError("Please feed the method 'local_term_weights', to run model") + + else: + raise ValueError('Your algorithm is currently not supported') + + def _create_probability_matrices( + self, + algorithm, + prob_topic_given_document_transpose=None, + prob_topic_given_word_transpose=None, + local_term_weights=None, + global_term_weights=None, + ): + """ + Method that performs matrix multiplications to obtain the output matrices. + + The 'algorithm' parameter is generic and the other ones depend on the selected algorithm. 
+ The other parameters passed into this method depend on the used algorithm. + + Parameters + ---------- + algorithm : str + Indicator for which algorithm is being trained ('flsa' or 'flsa-w'). + global_term_weights : scipy.sparse.dok_matrix + The output from self._create_partition_matrix(). + prob_topic_given_document_transpose : numpy.array : float + The output from self._create_partition_matrix() (flsa) + prob_topic_given_word_transpose : numpy.array : float + (flsa-w) + + Returns + ------- + numpy.array : float + The prbability of a word given a topic. + numpy.array : float + The prbability of a topic given a document. + """ + # Check whether the right variable are passed into the method. + self._check_passed_variables( + algorithm, + prob_topic_given_document_transpose, + prob_topic_given_word_transpose, + local_term_weights, + global_term_weights, + ) + + # Calculate the initial probabilities + if algorithm in [ + 'flsa', + 'flsa-w', + ]: + self._prob_word_i = self._create_prob_word_i(global_term_weights) + self._prob_document_j = self._create_prob_document_j(global_term_weights) + if algorithm in ['flsa-w']: + self._prob_topic_k = self._create_prob_topic_k( + prob_topic_given_word_transpose, + self._prob_word_i, + ) + elif algorithm in [ + 'flsa-e', + ]: + self._prob_word_i = self._create_prob_word_i(local_term_weights) + self._prob_document_j = self._create_prob_document_j(local_term_weights) + self._prob_topic_k = self._create_prob_topic_k( + prob_topic_given_word_transpose, self._prob_word_i, + ) + if algorithm in ['flsa']: + prob_document_and_topic = (prob_topic_given_document_transpose.T * self._prob_document_j).T + prob_document_given_topic = prob_document_and_topic / prob_document_and_topic.sum(axis=0) + self._prob_word_given_document = np.asarray(global_term_weights / global_term_weights.sum(1)) + self._prob_word_given_topic = np.matmul( + self._prob_word_given_document.T, + prob_document_given_topic, + ) + prob_topic_given_document = prob_topic_given_document_transpose.T + return self._prob_word_given_topic, prob_topic_given_document + + elif algorithm in [ + 'flsa-w', + 'flsa-e' + ]: + prob_word_and_topic = (prob_topic_given_word_transpose.T * self._prob_word_i).T + self._prob_word_given_topic = prob_word_and_topic / prob_word_and_topic.sum(axis=0) + if algorithm in ['flsa-w']: + self._prob_word_given_document = np.asarray(global_term_weights / global_term_weights.sum(1)).T + elif algorithm in [ + 'flsa-e', + ]: + self._prob_word_given_document = np.asarray(local_term_weights / local_term_weights.sum(1)).T + prob_document_given_word = ((self._prob_word_given_document * self._prob_document_j).T + / np.array(self._prob_word_i)) + prob_document_given_topic = np.matmul( + prob_document_given_word, + self._prob_word_given_topic, + ) + prob_topic_given_document = ((prob_document_given_topic * self._prob_topic_k).T + / self._prob_document_j) + return self._prob_word_given_topic, prob_topic_given_document + raise ValueError('"algorithm" is unknown.') + + @staticmethod + def _create_dictlist_topn( + topn, + prob_word_given_topic, + index_to_word, + ): + """ + Creates a list with dictionaries of word probabilities + per topic based on the top-n words. + + Parameters + ---------- + topn : int + The top-n words to include + (needs only to be used when 'method=topn'). + prob_word_given_topic : numpy.array : float + Matrix that gives the probability of a word given a topic. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. 
+ + Returns + ------- + list of dicts {int : float} + Keys: all the indices of words from prob_word_given_topic + who's weight's are amongst the top percentage. + Values: the probability associated to a word. + """ + if not isinstance(topn, int) and topn > 0: + raise ValueError("Please choose a positive integer for 'topn'") + top_dictionaries = [] + for topic_index in range(prob_word_given_topic.shape[1]): + new_dict = dict() + highest_weight_indices = prob_word_given_topic[:, topic_index].argsort()[-topn:] + for word_index in highest_weight_indices: + new_dict[index_to_word[word_index]] = prob_word_given_topic[ + word_index, topic_index, + ] + top_dictionaries.append(new_dict) + return top_dictionaries + + @staticmethod + def _create_dictlist_percentile( + perc, + prob_word_given_topic, + index_to_word, + ): + """ + Create a list with dictionaries of word probabilities + per topic based on the percentile. + - Keys: all the indices of words from prob_word_given_topic + who's weight's are amongst the top percentage. + - Values: the probability associated to a word. + + Parameters + ---------- + perc : float + The top percentile words to include + (needs only to be used when 'method=percentile'). + prob_word_given_topic : numpy.array : float + Matrix that gives the probability of a word given a topic. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. + + Returns + ------- + list of dicts {int : float} + Keys: all the indices of words from prob_word_given_topic + who's weight's are amongst the top percentage. + Values: the probability associated to a word. + """ + if not isinstance(perc, float) and 0 <= perc <= 1: + raise ValueError("Please choose a number between 0 and 1 for 'perc'") + top_list = [] + for top in range(prob_word_given_topic.shape[1]): + new_dict = dict() + count = 0 + i = 0 + weights = np.sort(prob_word_given_topic[:, top])[::-1] + word_indices = np.argsort(prob_word_given_topic[:, top])[::-1] + while count < perc: + new_dict[index_to_word[word_indices[i]]] = weights[i] + count += weights[i] + i += 1 + top_list.append(new_dict) + return top_list + + def show_topics( + self, + formatted=True, + prob_word_given_topic=None, + num_words=-1, + index_to_word=None, + ): + """ + Get a representation for the topics. + + Parameters + ---------- + formatted : bool + Whether the topic representations should be formatted as strings. + If False, they are returned as 2 tuples of (word, probability). + prob_word_given_topic : numpy.array : float + Matrix that gives the probability of a word given a topic. + num_words : int + Indicates how many words per topic should be shown. + index_to_word : dict {int : str} + Maps each unique index number to a unique vocabulary word. + + Returns + ------- + list of tuples (int, str) + The produced topics. 
+ """ + if prob_word_given_topic is None: + prob_word_given_topic = self._prob_word_given_topic + + if num_words < 0: + num_words = self.num_words + if index_to_word is None: + index_to_word = self._index_to_word + if not isinstance(prob_word_given_topic, np.ndarray): + raise TypeError("Please feed the algorithm 'prob_word_given_topic' as a np.ndarray") + if not isinstance(index_to_word, dict): + raise TypeError("Please feed the algorithm 'index_to_word' as a dict") + if not isinstance(num_words, int) or num_words <= 0: + raise TypeError("Please use a positive int for 'num_words'.") + if prob_word_given_topic.shape[0] < prob_word_given_topic.shape[1]: + raise ValueError("'prob_word_given_topic' has more columns then rows,", + " probably you need to take the transpose.") + warning = ["It seems like 'prob_word_given_topic' and 'index_to_word", + "are not aligned. The number of vocabulary words in", + "'prob_word_given_topic' deviate from the ", + "number of words in 'index_to_word'."] + if prob_word_given_topic.shape[0] != len(index_to_word.keys()): + warnings.warn(' '.join(warning)) + if not isinstance(formatted, bool): + raise ValueError('Please choose a boolean for "formatted"') + topic_list = [] + if not formatted: + for topic_index in range(prob_word_given_topic.shape[1]): + weight_words = "" + sorted_highest_weight_indices = prob_word_given_topic[:, topic_index].argsort()[-num_words:][::-1] + for word_index in sorted_highest_weight_indices: + weight_words += (str(round(prob_word_given_topic[word_index, topic_index], 4)) + + '*"' + index_to_word[word_index] + '" + ') + topic_list.append((topic_index, weight_words[:-3])) + return topic_list + else: + for topic_index in range(prob_word_given_topic.shape[1]): + word_list = [] + sorted_highest_weight_indices = prob_word_given_topic[:, topic_index].argsort()[-num_words:][::-1] + for word_index in sorted_highest_weight_indices: + word_list.append(index_to_word[word_index]) + topic_list.append(word_list) + return topic_list + + def get_topic_embedding( + self, + corpus, + prob_word_given_topic=None, + method='topn', + topn=20, + perc=0.05, + ): + """ + Create a topic embedding for each input document, + to be used as input to predictive models. + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + prob_word_given_topic : numpy.array : float + Matrix that gives the probability of a word given a topic. + method : str + Method to select words to be included in the embedding. + (choose from 'topn', 'percentile'): + - topn: for each topic the top n words with the highest + probability are included. + - percentile: for each topic all words with highest + probabilities are assigned while the cumulative + probability is lower than the percentile. + topn : int + The top-n words to include + (needs only to be used when 'method=topn'). + perc: float + The benchmark percentile until which words need to be added + (between 0 and 1). + + Returns + ------- + numpy.array : float + Array in which each row gives the topic embedding for + the associated document. 
+ """ + self._check_variables() + if prob_word_given_topic is None: + prob_word_given_topic = self._prob_word_given_topic + top_dist = [] + if method not in ['topn', 'percentile']: + raise ValueError(method, "is not a valid option for 'method'.", + " Choose either 'topn' or 'percentile'") + if method == 'topn': + dictlist = self._create_dictlist_topn( + topn, prob_word_given_topic, self._index_to_word, + ) + else: + dictlist = self._create_dictlist_percentile( + perc, prob_word_given_topic, self._index_to_word, + ) + for doc in corpus: + topic_weights = [0] * prob_word_given_topic.shape[1] + for word in doc: + for i in range(prob_word_given_topic.shape[1]): + topic_weights[i] += dictlist[i].get(word, 0) + top_dist.append(topic_weights) + return np.array(top_dist) + + def get_coherence_score( + self, + corpus=None, + topics=None, + coherence='c_v', + ): + """ + Calculate the coherence score for the generated topic. + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + topics : list of lists of str + The words per topics, + equivalent to self.show_topics(formatted=True). + coherence : str + The type of coherence to be calculated. + Choose from: 'u_mass', 'c_v', 'c_uci', 'c_npmi'. + + Returns + ------- + float + The coherence score. + """ + if corpus is None and topics is None: + corpus = self.corpus + topics = self.show_topics(formatted=True) + + id2word = corpora.Dictionary(corpus) + corpus_bow = [id2word.doc2bow(text) for text in corpus] + self.coherence_score = CoherenceModel( + topics=topics, + texts=corpus, + corpus=corpus_bow, + dictionary=id2word, + coherence=coherence, + topn=len(topics[0]), + ).get_coherence() + return self.coherence_score + + def get_diversity_score( + self, + topics=None, + ): + """'' + Calculate the diversity score for the generated topic. + + Diversity = number of unique words / number of total words. + See: https://tinyurl.com/2bs84zd8 + + Parameters + ---------- + topics : list of lists of str + The words per topics, + equivalent to self.show_topics(formatted=True). + + Returns + ------- + float + The diversity score. + """ + if topics is None: + topics = self.show_topics(formatted=True) + unique_words = set() + total_words = 0 + for top in topics: + unique_words.update(top) + total_words += len(top) + self.diversity_score = len(unique_words) / total_words + return self.diversity_score + + def get_interpretability_score( + self, + corpus=None, + topics=None, + coherence='c_v', + ): + """'' + Calculate the interpretability score for the generated topics. + + Interpretability = coherence * diversity. + (see: https://tinyurl.com/2bs84zd8) + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + topics : list of lists of str + The words per topics, equivalent to + self.show_topics(formatted=True). + coherence : str + The type of coherence to be calculated. + Choose from: 'u_mass', 'c_v', 'c_uci', 'c_npmi'. + + Returns + ------- + float + The interpretability score. 
+ """ + if corpus is None and topics is None: + corpus = self.corpus + topics = self.show_topics(formatted=True) + if self.coherence_score is None: + self.coherence_score = self.get_coherence_score( + corpus, + topics, + coherence, + ) + if self.diversity_score is None: + self.diversity_score = self.get_diversity_score(topics) + return self.coherence_score * self.diversity_score + + def get_vocabulary(self): + """ + Returns a set of all the words in the corpus + + Example: + After initializing an instance of the flsamodel models as 'model' + + corpus = [['this', 'is', 'the', 'first', 'file'], + ['and', 'this', 'is', 'second', 'file']] + + model.get_vocabulary() + + >>> {'this', 'is', 'the', 'first', 'file', 'and', 'second'} + """ + return self._vocabulary + + def get_topics(self): + """ + Get the term-topic matrix. + + Returns + ------- + numpy.ndarray + The probability for each word in each topic, + shape (num_topics, vocabulary_size). + """ + return self.pwgt + + def get_vocabulary_size(self): + """ + Returns the number of words in the vocabulary + + Example: + After initializing an instance of the flsamodel models as 'model' + + corpus = [['this', 'is', 'the', 'first', 'file'], + ['and', 'this', 'is', 'second', 'file']] + + model.get_vocabulary_size() + + >>> 7 + """ + return self._vocabulary_size + + def get_word_to_index(self): + """ + Obtain a dictionary that maps each vocabulary word to an index. + + Returns + ------- + dict of {str : int} + word to int mapping. + """ + return self._word_to_index + + def get_index_to_word(self): + """ + Obtain a dictionary that maps index numbers to vocabulary words. + + Returns + ------- + dict of {int : str} + int to word mapping. + """ + return self._index_to_word + + def get_corpus(self): + """ + Return the input file. + + Returns + ------- + list of list of str + The input file 'corpus'. + """ + return self.corpus + + def get_prob_word_i(self): + """ + Return the probabilities per word. + + Returns + ------- + np.array of float + The probabilities per word. + """ + return self._prob_word_i + + def get_prob_document_j(self): + """ + Return the probabilities per document. + + Returns + ------- + np.array of float + The probabilities per document. + """ + return self._prob_document_j + + def get_prob_topic_k(self): + """ + Return the probabilities per topic. + + Returns + ------- + np.array of float + The probabilities per topic. + """ + return self._prob_topic_k + + def save( + self, + filepath, + ): + """'' + Saves the object to the drive, using the pickle library. + + Parameters + ---------- + filepath : str + The directory in which the file should be stored, + either with or without the file name. + + Returns + ------- + float + The interpretability score. + """ + if not isinstance(filepath, str): + raise ValueError('Make sure that "filepath" has type "str"') + if filepath.endswith('.pickle'): + pickle_out = open(filepath, 'wb') + elif filepath.endswith('/'): + pickle_out = open(filepath + 'model.pickle', 'wb') + else: + pickle_out = open(filepath + '.pickle', 'wb') + pickle.dump(self, pickle_out) + pickle_out.close() + + def load( + self, + filepath, + ): + """'' + Loads the object from the drive, using the pickle library. + + Parameters + ---------- + filepath : str + The directory in which the file should be stored, + either with or without the file name. + + Returns + ------- + float + The interpretability score. 
+ """ + if not isinstance(filepath, str): + raise ValueError('Make sure that "filepath" has type "str"') + if not filepath.endswith('.pickle'): + if filepath.endswith('/'): + filepath += 'model.pickle' + else: + filepath += '/model.pickle' + infile = open(filepath, 'rb') + self.__dict__ = pickle.load(infile).__dict__ + infile.close() + + +class Flsa(FlsaModel): + """ + Class to run the FLSA algorithm (see: https://tinyurl.com/mskjaeuu). + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + num_topics : int + The number of topics that the model should train. + num_words : int + Indicates how many words per topic should be shown. + word_weighting : str + Indicates the method used for word_weighting. Choose from: + - entropy + - normal + - idf + - probidf + svd_factors : int + The number of singular values to include. + cluster_method : str + The cluster algorithm to be used ('fcm', 'gk', 'fst-pso'). + """ + def __init__( + self, + corpus, + num_topics, + num_words=10, + word_weighting='normal', + svd_factors=2, + cluster_method='fcm', + ): + super().__init__( + algorithm='flsa', + corpus=corpus, + num_topics=num_topics, + num_words=num_words, + word_weighting=word_weighting, + cluster_method=cluster_method, + svd_factors=svd_factors, + ) + + def _get_matrices(self): + """ + Method to obtain the matrices after the model has been initialized. + + Returns + ------- + numpy.array : float + The prbability of a word given a topic. + numpy.array : float + The prbability of a topic given a document. + """ + sparse_document_term_matrix = self._create_sparse_local_term_weights( + self.corpus, + self._vocabulary_size, + self._word_to_index, + ) + sparse_global_term_weighting = self._create_sparse_global_term_weights( + corpus=self.corpus, + word_weighting=self.word_weighting, + vocabulary_size=self._vocabulary_size, + sparse_local_term_weights=sparse_document_term_matrix, + index_to_word=self._index_to_word, + word_to_index=self._word_to_index, + sum_words=self._sum_words, + ) + projected_data = self._create_projected_data( + algorithm='flsa', + sparse_weighted_matrix=sparse_global_term_weighting, + svd_factors=self.svd_factors, + ) + partition_matrix = self._create_partition_matrix( + data=projected_data, + number_of_clusters=self.num_topics, + method=self.cluster_method + ) + return self._create_probability_matrices( + algorithm='flsa', + prob_topic_given_document_transpose=partition_matrix, + global_term_weights=sparse_global_term_weighting, + ) + + +class FlsaW(FlsaModel): + """ + Class to train the FLSA-W algorithm. + + See: https://ieeexplore.ieee.org/abstract/document/9660139 + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + num_topics : int + The number of topics that the model should train. + num_words : int + Indicates how many words per topic should be shown. + word_weighting : str + Indicates the method used for word_weighting. Choose from: + - entropy + - normal + - idf + - probidf + svd_factors : int + The number of singular values to include. + cluster_method : str + The cluster algorithm to be used ('fcm', 'gk', 'fst-pso'). 
+ """ + def __init__( + self, + corpus, + num_topics, + num_words=10, + word_weighting='normal', + svd_factors=2, + cluster_method='fcm', + ): + + super().__init__( + algorithm='flsa-w', + corpus=corpus, + num_topics=num_topics, + num_words=num_words, + word_weighting=word_weighting, + cluster_method=cluster_method, + svd_factors=svd_factors, + ) + + def _get_matrices(self): + """ + Method to obtain the matrices after the model has been initialized. + + Returns + ------- + numpy.array : float + The prbability of a word given a topic. + numpy.array : float + The prbability of a topic given a document. + """ + sparse_document_term_matrix = self._create_sparse_local_term_weights( + self.corpus, + self._vocabulary_size, + self._word_to_index, + ) + sparse_global_term_weighting = self._create_sparse_global_term_weights( + corpus=self.corpus, + word_weighting=self.word_weighting, + vocabulary_size=self._vocabulary_size, + sparse_local_term_weights=sparse_document_term_matrix, + index_to_word=self._index_to_word, + word_to_index=self._word_to_index, + sum_words=self._sum_words, + ) + projected_data = self._create_projected_data( + algorithm='flsa-w', + sparse_weighted_matrix=sparse_global_term_weighting, + svd_factors=self.svd_factors, + ) + partition_matrix = self._create_partition_matrix( + data=projected_data, + number_of_clusters=self.num_topics, + method=self.cluster_method, + ) + return self._create_probability_matrices( + algorithm='flsa-w', + prob_topic_given_word_transpose=partition_matrix, + global_term_weights=sparse_global_term_weighting, + ) + + +class FlsaE(FlsaModel): + """ + Class to train the FLSA-E algorithm. See: https://tinyurl.com/5n8utppk + + Parameters + ---------- + corpus : list of lists of str + The input file used to initialize the model. + num_topics : int + The number of topics that the model should train. + num_words : int + Indicates how many words per topic should be shown. + cluster_method : str + The cluster algorithm to be used ('fcm', 'gk', 'fst-pso'). + min_count : int + Ignores all words with total frequency lower than this. + window : int + Maximum distance between the current and predicted word within a sentence. + vector_size : int + Dimensionality of the word vectors. + workers : int + Use these many worker threads to train the model + ( = faster training with multicore machines). + """ + + def __init__( + self, + corpus, + num_topics, + num_words=10, + cluster_method='fcm', + min_count=1, + window=5, + vector_size=20, + workers=4, + ): + + self.model = ... + self.word_embedding = ... + + super().__init__( + algorithm='flsa-e', + corpus=corpus, + num_topics=num_topics, + num_words=num_words, + cluster_method=cluster_method, + min_count=min_count, + window=window, + vector_size=vector_size, + workers=workers, + ) + + def get_word_embedding( + self, + data, + vector_size, + window, + min_count, + workers, + ): + """ + Method to train a word embedding on the corpus. + + Parameters + ---------- + data : list of lists of str + The input file used to initialize the model. + min_count : int + Ignores all words with total frequency lower than this. + window : int + Maximum distance between the current and predicted word within a sentence. + vector_size : int + Dimensionality of the word vectors. + workers : int + Use these many worker threads to train the model + ( = faster training with multicore machines). 
+ """ + + self.model = Word2Vec( + sentences=data, + vector_size=vector_size, + window=window, + min_count=min_count, + workers=workers, + ) + + return self.model.wv.vectors + + def _get_matrices( + self, + ): + ''' + Method to run after the FLSA_E class has been initialized to obtain the output matrices. + + Returns: + - Numpy array: prob_word_given_topic + - Numpy array: prob_topic_given_document + ''' + sparse_document_term_matrix = self._create_sparse_local_term_weights( + self.corpus, + self._vocabulary_size, + self._word_to_index, + ) + + self.word_embedding = self.get_word_embedding( + data=self.corpus, + min_count=self.min_count, + vector_size=self.vector_size, + window=self.window, + workers=self.workers, + ) + + partition_matrix = self._create_partition_matrix( + data=self.word_embedding, + number_of_clusters=self.num_topics, + method=self.cluster_method, + ) + + return self._create_probability_matrices( + algorithm='flsa-e', + prob_topic_given_word_transpose=partition_matrix, + local_term_weights=sparse_document_term_matrix, + ) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 8f86b807f2..b09f440f92 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -9,8 +9,7 @@ and various similarity look-ups. Since trained word vectors are independent from the way they were trained (:class:`~gensim.models.word2vec.Word2Vec`, -:class:`~gensim.models.fasttext.FastText`, -:class:`~gensim.models.wrappers.varembed.VarEmbed` etc), they can be represented by a standalone structure, +:class:`~gensim.models.fasttext.FastText` etc), they can be represented by a standalone structure, as implemented in this module. The structure is called "KeyedVectors" and is essentially a mapping between *keys* @@ -518,7 +517,7 @@ def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize elif not ignore_missing: raise KeyError(f"Key '{key}' not present in vocabulary") - if(total_weight > 0): + if total_weight > 0: mean = mean / total_weight if post_normalize: mean = matutils.unitvec(mean).astype(REAL) @@ -593,7 +592,7 @@ def add_vectors(self, keys, weights, extras=None, replace=False): in_vocab_mask = np.zeros(len(keys), dtype=bool) for idx, key in enumerate(keys): - if key in self: + if key in self.key_to_index: in_vocab_mask[idx] = True # add new entities to the vocab @@ -919,10 +918,8 @@ def wmdistance(self, document1, document2, norm=True): When using this code, please consider citing the following papers: - * `Ofir Pele and Michael Werman "A linear time histogram metric for improved SIFT matching" - `_ - * `Ofir Pele and Michael Werman "Fast and robust earth mover's distances" - `_ + * `Rémi Flamary et al. "POT: Python Optimal Transport" + `_ * `Matt Kusner et al. "From Word Embeddings To Document Distances" `_. @@ -943,7 +940,7 @@ def wmdistance(self, document1, document2, norm=True): Warnings -------- - This method only works if `pyemd `_ is installed. + This method only works if `POT `_ is installed. If one of the documents have no words that exist in the vocab, `float('inf')` (i.e. infinity) will be returned. @@ -951,12 +948,11 @@ def wmdistance(self, document1, document2, norm=True): Raises ------ ImportError - If `pyemd `_ isn't installed. + If `POT `_ isn't installed. """ - # If pyemd C extension is available, import it. 
- # If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance - from pyemd import emd + # If POT is attempted to be used, but isn't installed, ImportError will be raised in wmdistance + from ot import emd2 # Remove out-of-vocabulary words. len_pre_oov1 = len(document1) @@ -1003,12 +999,12 @@ def nbow(document): d[idx] = freq / float(doc_len) # Normalized word frequencies. return d - # Compute nBOW representation of documents. This is what pyemd expects on input. + # Compute nBOW representation of documents. This is what POT expects on input. d1 = nbow(document1) d2 = nbow(document2) # Compute WMD. - return emd(d1, d2, distance_matrix) + return emd2(d1, d2, distance_matrix) def most_similar_cosmul( self, positive=None, negative=None, topn=10, restrict_vocab=None @@ -1168,7 +1164,7 @@ def cosine_similarities(vector_1, vectors_all): def distances(self, word_or_vector, other_words=()): """Compute cosine distances from given word or vector to all words in `other_words`. - If `other_words` is empty, return distance between `word_or_vectors` and all words in vocab. + If `other_words` is empty, return distance between `word_or_vector` and all words in vocab. Parameters ---------- @@ -1253,7 +1249,7 @@ def n_similarity(self, ws1, ws2): Similarities between `ws1` and `ws2`. """ - if not(len(ws1) and len(ws2)): + if not (len(ws1) and len(ws2)): raise ZeroDivisionError('At least one of the passed list is empty.') mean1 = self.get_mean_vector(ws1, pre_normalize=False) mean2 = self.get_mean_vector(ws2, pre_normalize=False) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 10a0c60134..b5f8017f07 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -314,7 +314,8 @@ def load(cls, fname, *args, **kwargs): class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel): - """Train and use Online Latent Dirichlet Allocation model as presented in `'Online Learning for LDA' by Hoffman et al.`_ + """Train and use Online Latent Dirichlet Allocation model as presented in + `'Online Learning for LDA' by Hoffman et al.`_ Examples ------- diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 0f222c9c6c..8ffcb5eee6 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -4,7 +4,8 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html # Based on Copyright (C) 2016 Radim Rehurek -"""Lda Sequence model, inspired by `David M. Blei, John D. Lafferty: "Dynamic Topic Models" +"""Lda Sequence model, inspired by +`David M. Blei, John D. Lafferty: "Dynamic Topic Models" `_. The original C/C++ implementation can be found on `blei-lab/dtm `_. @@ -744,7 +745,8 @@ def update_zeta(self): return self.zeta def compute_post_variance(self, word, chain_variance): - r"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1) + r"""Get the variance, based on the + `Variational Kalman Filtering approach for Approximate Inference (section 3.1) `_. 
This function accepts the word to compute variance for, along with the associated sslm class object, diff --git a/gensim/models/nmf_pgd.pyx b/gensim/models/nmf_pgd.pyx index dff480cdb4..2419272e5b 100644 --- a/gensim/models/nmf_pgd.pyx +++ b/gensim/models/nmf_pgd.pyx @@ -1,5 +1,6 @@ # Author: Timofey Yefimov +# cython: language_level=3 # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 528e3d6fa2..246482421f 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -5,7 +5,7 @@ a standard nearest neighbour method or a globally corrected neighbour retrieval method [1]_. This method can be used to augment the existing phrase tables with more candidate translations, or -filter out errors from the translation tables and known dictionaries [2]_. What's more, It also work +filter out errors from the translation tables and known dictionaries [2]_. What's more, it also works for any two sets of named-vectors where there are some paired-guideposts to learn the transformation. Examples @@ -14,7 +14,7 @@ How to make translation between two set of word-vectors ======================================================= -Initialize a word-vector models +Initialize two word-vector models .. sourcecode:: pycon @@ -24,7 +24,7 @@ >>> model_en = KeyedVectors.load_word2vec_format(datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) >>> model_it = KeyedVectors.load_word2vec_format(datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")) -Define word pairs (that will be used for construction of translation matrix +Define word pairs (that will be used for construction of translation matrix) .. sourcecode:: pycon @@ -143,12 +143,12 @@ def build(cls, lang_vec, lexicon=None): Object that stored word-vectors """ - # `words` to store all the word that - # `mat` to store all the word vector for the word in 'words' list + # `words` to store all the words + # `mat` to store the word vector for each word in the 'words' list words = [] mat = [] if lexicon is not None: - # if the lexicon is not provided, using the all the Keyedvectors's words as default + # if the lexicon is not provided, using all the Keyedvectors's words as default for item in lexicon: words.append(item) mat.append(lang_vec.vectors[lang_vec.get_index(item)]) @@ -161,18 +161,18 @@ def build(cls, lang_vec, lexicon=None): return Space(mat, words) def normalize(self): - """Normalize the word vector's matrix.""" - self.mat = self.mat / np.sqrt(np.sum(np.multiply(self.mat, self.mat), axis=1, keepdims=True)) + """Normalize the word vectors matrix.""" + self.mat = self.mat / np.sqrt(np.sum(np.square(self.mat), axis=1, keepdims=True)) class TranslationMatrix(utils.SaveLoad): - """Objects of this class realize the translation matrix which map the source language to the target language. + """Objects of this class realize the translation matrix which maps the source language to the target language. The main methods are: We map it to the other language space by computing z = Wx, then return the word whose representation is close to z. - The details use seen the notebook [3]_ + For details on use, see the tutorial notebook [3]_ Examples -------- @@ -234,7 +234,7 @@ def __init__(self, source_lang_vec, target_lang_vec, word_pairs=None, random_sta self.train(word_pairs) def train(self, word_pairs): - """Build the translation matrix that mapping from source space to target space. 
+ """Build the translation matrix to map from source space to target space. Parameters ---------- @@ -289,7 +289,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec Define translation algorithm, if `gc == 0` - use standard NN retrieval, otherwise, use globally corrected neighbour retrieval method (as described in [1]_). sample_num : int, optional - Number of word to sample from the source lexicon, if `gc == 1`, then `sample_num` **must** be provided. + Number of words to sample from the source lexicon, if `gc == 1`, then `sample_num` **must** be provided. source_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional New source language vectors for translation, by default, used the model's source language vector. target_lang_vec : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional @@ -366,15 +366,15 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec class BackMappingTranslationMatrix(utils.SaveLoad): - """Realize the BackMapping translation matrix which map the source model's document vector - to the target model's document vector(old model). + """Realize the BackMapping translation matrix which maps the source model's document vector + to the target model's document vector (old model). - BackMapping translation matrix is used to learn a mapping for two document vector space which we - specify as source document vector and target document vector. The target document vector are trained - on superset corpus of source document vector, we can incrementally increase the vector in + BackMapping translation matrix is used to learn a mapping for two document vector spaces which we + specify as source document vector and target document vector. The target document vectors are trained + on a superset corpus of source document vectors; we can incrementally increase the vector in the old model through the BackMapping translation matrix. - the details use seen the notebook [3]_. + For details on use, see the tutorial notebook [3]_. Examples -------- @@ -421,7 +421,7 @@ def __init__(self, source_lang_vec, target_lang_vec, tagged_docs=None, random_st self.train(tagged_docs) def train(self, tagged_docs): - """Build the translation matrix that mapping from the source model's vector to target model's vector + """Build the translation matrix to map from the source model's vectors to target model's vectors Parameters ---------- @@ -432,7 +432,7 @@ def train(self, tagged_docs): Returns ------- numpy.ndarray - Translation matrix that mapping from the source model's vector to target model's vector. + Translation matrix that maps from the source model's vectors to target model's vectors. """ m1 = [self.source_lang_vec.dv[item.tags].flatten() for item in tagged_docs] diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 061dcfc817..d4a4ba992e 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -68,7 +68,7 @@ >>> sims = model.wv.most_similar('computer', topn=10) # get other similar words The reason for separating the trained vectors into `KeyedVectors` is that if you don't -need the full model state any more (don't need to continue training), its state can discarded, +need the full model state any more (don't need to continue training), its state can be discarded, keeping just the vectors and their keys proper. 
This results in a much smaller and faster object that can be mmapped for lightning @@ -200,6 +200,9 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +# This import is required by pickle to load models stored by Gensim < 4.0, such as Gensim 3.8.3. +from gensim.models.keyedvectors import Vocab # noqa + from smart_open.compression import get_supported_extensions logger = logging.getLogger(__name__) @@ -1983,9 +1986,6 @@ def _load_specials(self, *args, **kwargs): for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): if hasattr(self.trainables, a): setattr(self, a, getattr(self.trainables, a)) - if hasattr(self, 'syn1'): - self.syn1 = self.syn1 - del self.syn1 del self.trainables if not hasattr(self, 'shrink_windows'): self.shrink_windows = True diff --git a/gensim/models/word2vec_corpusfile.pxd b/gensim/models/word2vec_corpusfile.pxd index 7030686916..56e8cb64ee 100644 --- a/gensim/models/word2vec_corpusfile.pxd +++ b/gensim/models/word2vec_corpusfile.pxd @@ -1,4 +1,5 @@ # distutils: language = c++ +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True diff --git a/gensim/models/word2vec_corpusfile.pyx b/gensim/models/word2vec_corpusfile.pyx index 5d7f5004e4..a2b962aed6 100644 --- a/gensim/models/word2vec_corpusfile.pyx +++ b/gensim/models/word2vec_corpusfile.pyx @@ -1,5 +1,6 @@ #!/usr/bin/env cython # distutils: language = c++ +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd index 82abad2f05..4b4523dc55 100644 --- a/gensim/models/word2vec_inner.pxd +++ b/gensim/models/word2vec_inner.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index ffdc908b5c..1c0807ee0f 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -1,4 +1,5 @@ #!/usr/bin/env cython +# cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -19,12 +20,7 @@ from libc.math cimport exp from libc.math cimport log from libc.string cimport memset -# scipy <= 0.15 -try: - from scipy.linalg.blas import fblas -except ImportError: - # in scipy > 0.15, fblas function has been removed - import scipy.linalg.blas as fblas +import scipy.linalg.blas as fblas REAL = np.float32 @@ -126,7 +122,7 @@ cdef void w2v_fast_sentence_sg_hs( if _compute_loss == 1: sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1 - lprob = -1*sgn*f_dot + lprob = sgn*f_dot if lprob <= -MAX_EXP or lprob >= MAX_EXP: continue lprob = LOG_TABLE[((lprob + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] @@ -326,7 +322,7 @@ cdef void w2v_fast_sentence_cbow_hs( if _compute_loss == 1: sgn = (-1)**word_code[b] # ch function: 0-> 1, 1 -> -1 - lprob = -1*sgn*f_dot + lprob = sgn*f_dot if lprob <= -MAX_EXP or lprob >= MAX_EXP: continue lprob = LOG_TABLE[((lprob + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index cdb966547d..2e46479f87 100644 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -73,6 +73,7 @@ import itertools import os import heapq +import warnings import numpy import scipy.sparse @@ -906,7 +907,8 @@ class SoftCosineSimilarity(interfaces.SimilarityABC): for more 
examples. """ - def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, normalized=(True, True)): + def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, normalized=None, + normalize_queries=True, normalize_documents=True): """ Parameters @@ -919,12 +921,19 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, norm The number of results to retrieve for a query, if None - return similarities with all elements from corpus. chunksize: int, optional Size of one corpus chunk. - normalized : tuple of {True, False, 'maintain'}, optional - First/second value specifies whether the query/document vectors in the inner product - will be L2-normalized (True; corresponds to the soft cosine similarity measure; default), - maintain their L2-norm during change of basis ('maintain'; corresponds to query - expansion with partial membership), or kept as-is (False; - corresponds to query expansion). + normalized : tuple of {True, False, 'maintain', None}, optional + A deprecated alias for `(normalize_queries, normalize_documents)`. If None, use + `normalize_queries` and `normalize_documents`. Default is None. + normalize_queries : {True, False, 'maintain'}, optional + Whether the query vector in the inner product will be L2-normalized (True; corresponds + to the soft cosine similarity measure; default), maintain its L2-norm during change of + basis ('maintain'; corresponds to query expansion with partial membership), or kept as-is + (False; corresponds to query expansion). + normalize_documents : {True, False, 'maintain'}, optional + Whether the document vector in the inner product will be L2-normalized (True; corresponds + to the soft cosine similarity measure; default), maintain its L2-norm during change of + basis ('maintain'; corresponds to query expansion with partial membership), or kept as-is + (False; corresponds to query expansion). See Also -------- @@ -941,7 +950,14 @@ def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, norm self.corpus = list(corpus) self.num_best = num_best self.chunksize = chunksize - self.normalized = normalized + if normalized is not None: + warnings.warn( + 'Parameter normalized will be removed in 5.0.0, use normalize_queries and normalize_documents instead', + category=DeprecationWarning, + ) + self.normalized = normalized + else: + self.normalized = (normalize_queries, normalize_documents) # Normalization of features is undesirable, since soft cosine similarity requires special # normalization using the similarity matrix. Therefore, we would just be normalizing twice, @@ -998,10 +1014,8 @@ class WmdSimilarity(interfaces.SimilarityABC): When using this code, please consider citing the following papers: - * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching" - `_ - * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances" - `_ + * `Rémi Flamary et al. "POT: Python Optimal Transport" + `_ + * `Matt Kusner et al. "From Word Embeddings To Document Distances" `_ @@ -1102,6 +1116,50 @@ def __str__(self): class SparseMatrixSimilarity(interfaces.SimilarityABC): """Compute cosine similarity against a corpus of documents by storing the index matrix in memory. + Examples + -------- + Here is how you would index and query a corpus of documents in the bag-of-words format using the + cosine similarity: + + ..
sourcecode:: pycon + + >>> from gensim.corpora import Dictionary + >>> from gensim.similarities import SparseMatrixSimilarity + >>> from gensim.test.utils import common_texts as corpus + >>> + >>> dictionary = Dictionary(corpus) # fit dictionary + >>> bow_corpus = [dictionary.doc2bow(line) for line in corpus] # convert corpus to BoW format + >>> index = SparseMatrixSimilarity(bow_corpus, num_docs=len(corpus), num_terms=len(dictionary)) + >>> + >>> query = 'graph trees computer'.split() # make a query + >>> bow_query = dictionary.doc2bow(query) + >>> similarities = index[bow_query] # calculate similarity of query to each doc from bow_corpus + + Here is how you would index and query a corpus of documents using the Okapi BM25 scoring + function: + + .. sourcecode:: pycon + + >>> from gensim.corpora import Dictionary + >>> from gensim.models import TfidfModel, OkapiBM25Model + >>> from gensim.similarities import SparseMatrixSimilarity + >>> from gensim.test.utils import common_texts as corpus + >>> + >>> dictionary = Dictionary(corpus) # fit dictionary + >>> query_model = TfidfModel(dictionary=dictionary, smartirs='bnn') # enforce binary weights + >>> document_model = OkapiBM25Model(dictionary=dictionary) # fit bm25 model + >>> + >>> bow_corpus = [dictionary.doc2bow(line) for line in corpus] # convert corpus to BoW format + >>> bm25_corpus = document_model[bow_corpus] + >>> index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary), + ... normalize_queries=False, normalize_documents=False) + >>> + >>> query = 'graph trees computer'.split() # make a query + >>> bow_query = dictionary.doc2bow(query) + >>> bm25_query = query_model[bow_query] + >>> similarities = index[bm25_query] # calculate similarity of query to each doc from bow_corpus + Notes ----- Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM. @@ -1122,7 +1180,8 @@ class SparseMatrixSimilarity(interfaces.SimilarityABC): """ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, - num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): + num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False, + normalize_queries=True, normalize_documents=True): """ Parameters @@ -1146,10 +1205,15 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num Data type of the internal matrix. maintain_sparsity : bool, optional Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`? - + normalize_queries : bool, optional + If queries are in bag-of-words (int, float) format, as opposed to a sparse or dense + 2D array, they will be L2-normalized. Default is True. + normalize_documents : bool, optional + If `corpus` is in bag-of-words (int, float) format, as opposed to a sparse or dense + 2D array, it will be L2-normalized. Default is True.
""" self.num_best = num_best - self.normalize = True + self.normalize = normalize_queries self.chunksize = chunksize self.maintain_sparsity = maintain_sparsity @@ -1173,7 +1237,7 @@ def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly") corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else - matutils.unitvec(v)) for v in corpus) + matutils.unitvec(v) if normalize_documents else v) for v in corpus) self.index = matutils.corpus2csc( corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz, dtype=dtype, printprogress=10000, diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index e47a5442b2..fe0366bb04 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -43,9 +43,11 @@ cdef extern from *: void * s1_data = PyUnicode_DATA(s1); void * s2_data = PyUnicode_DATA(s2); - for (WIDTH tmpi = 0; tmpi <= len_s1; tmpi++) row2[tmpi] = tmpi; + WIDTH tmpi; + for (tmpi = 0; tmpi <= len_s1; tmpi++) row2[tmpi] = tmpi; - for (WIDTH i2 = 0; i2 < len_s2; i2++) { + WIDTH i2; + for (i2 = 0; i2 < len_s2; i2++) { int all_bad = i2 >= maximum; const Py_UCS4 ch = PyUnicode_READ(kind2, s2_data, i2); row_flip = 1 - row_flip; @@ -56,7 +58,8 @@ cdef extern from *: } *pos_new = i2 + 1; - for (WIDTH i1 = 0; i1 < len_s1; i1++) { + WIDTH i1; + for (i1 = 0; i1 < len_s1; i1++) { WIDTH val = *(pos_old++); if (ch != PyUnicode_READ(kind1, s1_data, i1)) { const WIDTH _val1 = *pos_old; diff --git a/gensim/test/test_bm25model.py b/gensim/test/test_bm25model.py new file mode 100644 index 0000000000..4cb9ca49ee --- /dev/null +++ b/gensim/test/test_bm25model.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from collections import defaultdict +import math +import unittest + +from gensim.models.bm25model import BM25ABC +from gensim.models import OkapiBM25Model, LuceneBM25Model, AtireBM25Model + +from gensim.corpora import Dictionary + + +class BM25Stub(BM25ABC): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def precompute_idfs(self, dfs, num_docs): + return dict() + + def get_term_weights(self, num_tokens, term_frequencies, idfs): + return term_frequencies + + +class BM25ABCTest(unittest.TestCase): + def setUp(self): + self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + self.dictionary = Dictionary(self.documents) + + self.expected_avgdl = sum(map(len, self.documents)) / len(self.documents) + + def test_avgdl_from_corpus(self): + corpus = list(map(self.dictionary.doc2bow, self.documents)) + model = BM25Stub(corpus=corpus) + actual_avgdl = model.avgdl + self.assertAlmostEqual(self.expected_avgdl, actual_avgdl) + + def test_avgdl_from_dictionary(self): + model = BM25Stub(dictionary=self.dictionary) + actual_avgdl = model.avgdl + self.assertAlmostEqual(self.expected_avgdl, actual_avgdl) + + +class OkapiBM25ModelTest(unittest.TestCase): + def setUp(self): + self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + self.dictionary = Dictionary(self.documents) + self.k1, self.b, self.epsilon = 1.5, 0.75, 0.25 + + def get_idf(word): + frequency = sum(map(lambda document: word in document, self.documents)) + return math.log((len(self.documents) - frequency + 0.5) / (frequency + 0.5)) + + dog_idf = get_idf('dog') + cat_idf = get_idf('cat') + mouse_idf = get_idf('mouse') + lion_idf = 
get_idf('lion') + + average_idf = (dog_idf + cat_idf + mouse_idf + lion_idf) / len(self.dictionary) + eps = self.epsilon * average_idf + + self.expected_dog_idf = dog_idf if dog_idf > 0 else eps + self.expected_cat_idf = cat_idf if cat_idf > 0 else eps + self.expected_mouse_idf = mouse_idf if mouse_idf > 0 else eps + self.expected_lion_idf = lion_idf if lion_idf > 0 else eps + + def test_idfs_from_corpus(self): + corpus = list(map(self.dictionary.doc2bow, self.documents)) + model = OkapiBM25Model(corpus=corpus, k1=self.k1, b=self.b, epsilon=self.epsilon) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_idfs_from_dictionary(self): + model = OkapiBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b, epsilon=self.epsilon) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_score(self): + model = OkapiBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b, epsilon=self.epsilon) + + first_document = self.documents[0] + first_bow = self.dictionary.doc2bow(first_document) + weights = defaultdict(lambda: 0.0) + weights.update(model[first_bow]) + + actual_dog_weight = weights[self.dictionary.token2id['dog']] + actual_cat_weight = weights[self.dictionary.token2id['cat']] + actual_mouse_weight = weights[self.dictionary.token2id['mouse']] + actual_lion_weight = weights[self.dictionary.token2id['lion']] + + def get_expected_weight(word): + idf = model.idfs[self.dictionary.token2id[word]] + numerator = self.k1 + 1 + denominator = 1 + self.k1 * (1 - self.b + self.b * len(first_document) / model.avgdl) + return idf * numerator / denominator + + expected_dog_weight = get_expected_weight('dog') if 'dog' in first_document else 0.0 + expected_cat_weight = get_expected_weight('cat') if 'cat' in first_document else 0.0 + expected_mouse_weight = get_expected_weight('mouse') if 'mouse' in first_document else 0.0 + expected_lion_weight = get_expected_weight('lion') if 'lion' in first_document else 0.0 + + self.assertAlmostEqual(expected_dog_weight, actual_dog_weight) + self.assertAlmostEqual(expected_cat_weight, actual_cat_weight) + self.assertAlmostEqual(expected_mouse_weight, actual_mouse_weight) + self.assertAlmostEqual(expected_lion_weight, actual_lion_weight) + + +class LuceneBM25ModelTest(unittest.TestCase): + def setUp(self): + self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + self.dictionary = Dictionary(self.documents) + self.k1, self.b = 1.5, 0.75 + + def get_idf(word): + frequency = sum(map(lambda document: word in document, self.documents)) + return math.log(1.0 + (len(self.documents) - frequency + 0.5) / 
(frequency + 0.5)) + + self.expected_dog_idf = get_idf('dog') + self.expected_cat_idf = get_idf('cat') + self.expected_mouse_idf = get_idf('mouse') + self.expected_lion_idf = get_idf('lion') + + def test_idfs_from_corpus(self): + corpus = list(map(self.dictionary.doc2bow, self.documents)) + model = LuceneBM25Model(corpus=corpus, k1=self.k1, b=self.b) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_idfs_from_dictionary(self): + model = LuceneBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_score(self): + model = LuceneBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b) + + first_document = self.documents[0] + first_bow = self.dictionary.doc2bow(first_document) + weights = defaultdict(lambda: 0.0) + weights.update(model[first_bow]) + + actual_dog_weight = weights[self.dictionary.token2id['dog']] + actual_cat_weight = weights[self.dictionary.token2id['cat']] + actual_mouse_weight = weights[self.dictionary.token2id['mouse']] + actual_lion_weight = weights[self.dictionary.token2id['lion']] + + def get_expected_weight(word): + idf = model.idfs[self.dictionary.token2id[word]] + denominator = 1 + self.k1 * (1 - self.b + self.b * len(first_document) / model.avgdl) + return idf / denominator + + expected_dog_weight = get_expected_weight('dog') if 'dog' in first_document else 0.0 + expected_cat_weight = get_expected_weight('cat') if 'cat' in first_document else 0.0 + expected_mouse_weight = get_expected_weight('mouse') if 'mouse' in first_document else 0.0 + expected_lion_weight = get_expected_weight('lion') if 'lion' in first_document else 0.0 + + self.assertAlmostEqual(expected_dog_weight, actual_dog_weight) + self.assertAlmostEqual(expected_cat_weight, actual_cat_weight) + self.assertAlmostEqual(expected_mouse_weight, actual_mouse_weight) + self.assertAlmostEqual(expected_lion_weight, actual_lion_weight) + + +class AtireBM25ModelTest(unittest.TestCase): + def setUp(self): + self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + self.dictionary = Dictionary(self.documents) + self.k1, self.b, self.epsilon = 1.5, 0.75, 0.25 + + def get_idf(word): + frequency = sum(map(lambda document: word in document, self.documents)) + return math.log(len(self.documents) / frequency) + + self.expected_dog_idf = get_idf('dog') + self.expected_cat_idf = get_idf('cat') + self.expected_mouse_idf = get_idf('mouse') + self.expected_lion_idf = get_idf('lion') + + def test_idfs_from_corpus(self): + corpus = list(map(self.dictionary.doc2bow, self.documents)) + model = 
AtireBM25Model(corpus=corpus, k1=self.k1, b=self.b) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_idfs_from_dictionary(self): + model = AtireBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b) + + actual_dog_idf = model.idfs[self.dictionary.token2id['dog']] + actual_cat_idf = model.idfs[self.dictionary.token2id['cat']] + actual_mouse_idf = model.idfs[self.dictionary.token2id['mouse']] + actual_lion_idf = model.idfs[self.dictionary.token2id['lion']] + + self.assertAlmostEqual(self.expected_dog_idf, actual_dog_idf) + self.assertAlmostEqual(self.expected_cat_idf, actual_cat_idf) + self.assertAlmostEqual(self.expected_mouse_idf, actual_mouse_idf) + self.assertAlmostEqual(self.expected_lion_idf, actual_lion_idf) + + def test_score(self): + model = AtireBM25Model(dictionary=self.dictionary, k1=self.k1, b=self.b) + + first_document = self.documents[0] + first_bow = self.dictionary.doc2bow(first_document) + weights = defaultdict(lambda: 0.0) + weights.update(model[first_bow]) + + actual_dog_weight = weights[self.dictionary.token2id['dog']] + actual_cat_weight = weights[self.dictionary.token2id['cat']] + actual_mouse_weight = weights[self.dictionary.token2id['mouse']] + actual_lion_weight = weights[self.dictionary.token2id['lion']] + + def get_expected_weight(word): + idf = model.idfs[self.dictionary.token2id[word]] + numerator = self.k1 + 1 + denominator = 1 + self.k1 * (1 - self.b + self.b * len(first_document) / model.avgdl) + return idf * numerator / denominator + + expected_dog_weight = get_expected_weight('dog') if 'dog' in first_document else 0.0 + expected_cat_weight = get_expected_weight('cat') if 'cat' in first_document else 0.0 + expected_mouse_weight = get_expected_weight('mouse') if 'mouse' in first_document else 0.0 + expected_lion_weight = get_expected_weight('lion') if 'lion' in first_document else 0.0 + + self.assertAlmostEqual(expected_dog_weight, actual_dog_weight) + self.assertAlmostEqual(expected_cat_weight, actual_cat_weight) + self.assertAlmostEqual(expected_mouse_weight, actual_mouse_weight) + self.assertAlmostEqual(expected_lion_weight, actual_lion_weight) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 2b111f7306..9927851a93 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -305,6 +305,12 @@ def testCompareCoherenceForModels(self): self.assertAlmostEqual(np.mean(coherence_topics2), coherence2, 4) self.assertAlmostEqual(coherence1, coherence2, places=4) + def testEmptyList(self): + """Test if CoherenceModel works with document without tokens""" + texts = self.texts + [[]] + cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence="c_v", processes=1) + cm.get_coherence() + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) diff --git a/gensim/test/test_data/model-from-gensim-3.8.0.w2v b/gensim/test/test_data/model-from-gensim-3.8.0.w2v new file mode 100644 index 0000000000..40f7e22e32 Binary files /dev/null and 
b/gensim/test/test_data/model-from-gensim-3.8.0.w2v differ diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py deleted file mode 100644 index 0e57d15e7e..0000000000 --- a/gensim/test/test_dtm.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -Automated tests for DTM/DIM model -""" - - -import logging -from subprocess import CalledProcessError -import gensim -import os -import unittest -from gensim import corpora -from gensim.test.utils import datapath - - -class TestDtmModel(unittest.TestCase): - - def setUp(self): - self.time_slices = [3, 7] - self.corpus = corpora.mmcorpus.MmCorpus(datapath('dtm_test.mm')) - self.id2word = corpora.Dictionary.load(datapath('dtm_test.dict')) - # first you need to setup the environment variable $DTM_PATH for the dtm executable file - self.dtm_path = os.environ.get('DTM_PATH', None) - if not self.dtm_path: - self.skipTest("$DTM_PATH is not properly set up.") - - def test_dtm(self): - if self.dtm_path is not None: - model = gensim.models.wrappers.DtmModel( - self.dtm_path, self.corpus, self.time_slices, num_topics=2, - id2word=self.id2word, model='dtm', initialize_lda=True, - rng_seed=1 - ) - topics = model.show_topics(num_topics=2, times=2, num_words=10) - self.assertEqual(len(topics), 4) - - one_topic = model.show_topic(topicid=1, time=1, topn=10) - self.assertEqual(len(one_topic), 10) - self.assertEqual(one_topic[0][1], u'idexx') - - def test_dim(self): - if self.dtm_path is not None: - model = gensim.models.wrappers.DtmModel( - self.dtm_path, self.corpus, self.time_slices, num_topics=2, - id2word=self.id2word, model='fixed', initialize_lda=True, - rng_seed=1 - ) - topics = model.show_topics(num_topics=2, times=2, num_words=10) - self.assertEqual(len(topics), 4) - - one_topic = model.show_topic(topicid=1, time=1, topn=10) - self.assertEqual(len(one_topic), 10) - self.assertEqual(one_topic[0][1], u'skills') - - # In stderr expect "Error opening file /tmp/a65419_train_out/initial-lda-ss.dat. Failing." 
- def test_called_process_error(self): - if self.dtm_path is not None: - with self.assertRaises(CalledProcessError): - gensim.models.wrappers.DtmModel( - self.dtm_path, self.corpus, self.time_slices, num_topics=2, - id2word=self.id2word, model='dtm', initialize_lda=False, - rng_seed=1 - ) - - -if __name__ == '__main__': - logging.basicConfig(level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index ecc44a30e4..e07eaab9a1 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -30,10 +30,10 @@ import gensim.models.fasttext try: - from pyemd import emd # noqa:F401 - PYEMD_EXT = True + from ot import emd2 # noqa:F401 + POT_EXT = True except (ImportError, ValueError): - PYEMD_EXT = False + POT_EXT = False logger = logging.getLogger(__name__) @@ -394,7 +394,7 @@ def test_contains(self): self.assertFalse('nights' in self.test_model.wv.key_to_index) self.assertTrue('nights' in self.test_model.wv) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_wm_distance(self): doc = ['night', 'payment'] oov_doc = ['nights', 'forests', 'payments'] @@ -1782,6 +1782,28 @@ def test_identity(self): self.assertTrue(np.all(np.array([6, 7, 8]) == n[2])) +class FastTextKeyedVectorsTest(unittest.TestCase): + def test_add_vector(self): + wv = FastTextKeyedVectors(vector_size=2, min_n=3, max_n=6, bucket=2000000) + wv.add_vector("test_key", np.array([0, 0])) + + self.assertEqual(wv.key_to_index["test_key"], 0) + self.assertEqual(wv.index_to_key[0], "test_key") + self.assertTrue(np.all(wv.vectors[0] == np.array([0, 0]))) + + def test_add_vectors(self): + wv = FastTextKeyedVectors(vector_size=2, min_n=3, max_n=6, bucket=2000000) + wv.add_vectors(["test_key1", "test_key2"], np.array([[0, 0], [1, 1]])) + + self.assertEqual(wv.key_to_index["test_key1"], 0) + self.assertEqual(wv.index_to_key[0], "test_key1") + self.assertTrue(np.all(wv.vectors[0] == np.array([0, 0]))) + + self.assertEqual(wv.key_to_index["test_key2"], 1) + self.assertEqual(wv.index_to_key[1], "test_key2") + self.assertTrue(np.all(wv.vectors[1] == np.array([1, 1]))) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 297006b75f..7ce675e337 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -33,7 +33,7 @@ def test_random_state(): testcases = [np.random.seed(0), None, np.random.RandomState(0), 0] for testcase in testcases: - assert(isinstance(utils.get_random_state(testcase), np.random.RandomState)) + assert isinstance(utils.get_random_state(testcase), np.random.RandomState) class TestLdaModel(unittest.TestCase, basetmtests.TestBaseTopicModel): diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 0b917980d2..a7fdfdf7bc 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -36,10 +36,10 @@ from gensim.similarities.fastss import editdist try: - from pyemd import emd # noqa:F401 - PYEMD_EXT = True + from ot import emd2 # noqa:F401 + POT_EXT = True except (ImportError, ValueError): - PYEMD_EXT = False + POT_EXT = False SENTENCES = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(TEXTS)] @@ -88,8 +88,8 @@ def test_full(self, num_best=None, shardsize=100): index.destroy() def test_num_best(self): - if self.cls == 
similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") for num_best in [None, 0, 1, 9, 1000]: self.testFull(num_best=num_best) @@ -119,8 +119,8 @@ def test_scipy2scipy_clipped(self): def test_empty_query(self): index = self.factoryMethod() - if isinstance(index, similarities.WmdSimilarity) and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if isinstance(index, similarities.WmdSimilarity) and not POT_EXT: + self.skipTest("POT not installed") query = [] try: @@ -177,8 +177,8 @@ def test_iter(self): index.destroy() def test_persistency(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl') index = self.factoryMethod() @@ -197,8 +197,8 @@ def test_persistency(self): self.assertEqual(index.num_best, index2.num_best) def test_persistency_compressed(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl.gz') index = self.factoryMethod() @@ -217,8 +217,8 @@ def test_persistency_compressed(self): self.assertEqual(index.num_best, index2.num_best) def test_large(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl') index = self.factoryMethod() @@ -239,8 +239,8 @@ def test_large(self): self.assertEqual(index.num_best, index2.num_best) def test_large_compressed(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl.gz') index = self.factoryMethod() @@ -261,8 +261,8 @@ def test_large_compressed(self): self.assertEqual(index.num_best, index2.num_best) def test_mmap(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl') index = self.factoryMethod() @@ -284,8 +284,8 @@ def test_mmap(self): self.assertEqual(index.num_best, index2.num_best) def test_mmap_compressed(self): - if self.cls == similarities.WmdSimilarity and not PYEMD_EXT: - self.skipTest("pyemd not installed") + if self.cls == similarities.WmdSimilarity and not POT_EXT: + self.skipTest("POT not installed") fname = get_tmpfile('gensim_similarities.tst.pkl.gz') index = self.factoryMethod() @@ -310,7 +310,7 @@ def factoryMethod(self): # Override factoryMethod. return self.cls(TEXTS, self.w2v_model) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_full(self, num_best=None): # Override testFull. 
@@ -329,7 +329,7 @@ def test_full(self, num_best=None): self.assertTrue(numpy.alltrue(sims[1:] > 0.0)) self.assertTrue(numpy.alltrue(sims[1:] < 1.0)) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_non_increasing(self): ''' Check that similarities are non-increasing when `num_best` is not `None`.''' @@ -345,7 +345,7 @@ def test_non_increasing(self): cond = sum(numpy.diff(sims2) < 0) == len(sims2) - 1 self.assertTrue(cond) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_chunking(self): # Override testChunking. @@ -364,7 +364,7 @@ def test_chunking(self): self.assertTrue(numpy.alltrue(sim > 0.0)) self.assertTrue(numpy.alltrue(sim <= 1.0)) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_iter(self): # Override testIter. @@ -373,7 +373,7 @@ def test_iter(self): self.assertTrue(numpy.alltrue(sims >= 0.0)) self.assertTrue(numpy.alltrue(sims <= 1.0)) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_str(self): index = self.cls(TEXTS, self.w2v_model) self.assertTrue(str(index)) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 7e58275208..a07cf08b10 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -21,10 +21,10 @@ from testfixtures import log_capture try: - from pyemd import emd # noqa:F401 - PYEMD_EXT = True + from ot import emd2 # noqa:F401 + POT_EXT = True except (ImportError, ValueError): - PYEMD_EXT = False + POT_EXT = False from gensim import utils from gensim.models import word2vec, keyedvectors @@ -275,6 +275,13 @@ def test_persistence(self): self.assertTrue(np.allclose(wv.vectors, loaded_wv.vectors)) self.assertEqual(len(wv), len(loaded_wv)) + def test_persistence_backwards_compatible(self): + """Can we still load a model created with an older gensim version?""" + path = datapath('model-from-gensim-3.8.0.w2v') + model = word2vec.Word2Vec.load(path) + x = model.score(['test']) + assert x is not None + def test_persistence_from_file(self): """Test storing/loading the entire model trained with corpus_file argument.""" with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: @@ -1084,7 +1091,7 @@ def test_negative_ns_exp(self): class TestWMD(unittest.TestCase): - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_nonzero(self): '''Test basic functionality with a test sentence.''' @@ -1096,7 +1103,7 @@ def test_nonzero(self): # Check that distance is non-zero. 
self.assertFalse(distance == 0.0) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_symmetry(self): '''Check that distance is symmetric.''' @@ -1107,7 +1114,7 @@ def test_symmetry(self): distance2 = model.wv.wmdistance(sentence2, sentence1) self.assertTrue(np.allclose(distance1, distance2)) - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + @unittest.skipIf(POT_EXT is False, "POT not installed") def test_identical_sentences(self): '''Check that the distance from a sentence to itself is zero.''' diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 2c06185a0b..58bdc2c35f 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -293,7 +293,8 @@ def accumulate(self, texts, window_size): relevant_texts, window_size, ignore_below_size=False, include_doc_num=True) for doc_num, virtual_document in windows: - self.analyze_text(virtual_document, doc_num) + if len(virtual_document) > 0: + self.analyze_text(virtual_document, doc_num) self.num_docs += 1 return self diff --git a/gensim/utils.py b/gensim/utils.py index 78d64b88e6..0619296888 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1863,7 +1863,7 @@ def keep_vocab_item(word, count, min_count, trim_rule=None): def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): r"""Run OS command with the given arguments and return its output as a byte string. - Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`. + Backported from Python 2.7 with a few minor modifications. Used in word2vec/glove2word2vec tests. Behaves very similar to https://docs.python.org/2/library/subprocess.html#subprocess.check_output. Examples diff --git a/release/hijack_pr.py b/release/hijack_pr.py index d109836d49..fb326fff7c 100755 --- a/release/hijack_pr.py +++ b/release/hijack_pr.py @@ -18,6 +18,14 @@ The above commands would check out the code for the PR, make changes to them, and push them back. Obviously, this requires the PR to be writable, but most gensim PRs are. If they aren't, then leave it up to the PR author to make the required changes. + +Sometimes, we'll make upstream changes that we want to merge into existing PRs. +This is particularly useful when some nagging build problem is affecting multiple PRs. +We can achieve this with: + + $ release/hijack_pr.py merge-upstream-into 1234 + +This hijacks the PR and merges upstream/develop into it. 
""" import json import subprocess @@ -25,47 +33,67 @@ import smart_open + def check_output(command): return subprocess.check_output(command).strip().decode('utf-8') -if sys.argv[1] == "push": +def push(): command = "git rev-parse --abbrev-ref HEAD@{upstream}".split() remote, remote_branch = check_output(command).split('/') current_branch = check_output(['git', 'branch', '--show-current']) - check_output(['git', 'push', remote, f'{current_branch}:{remote_branch}']) + subprocess.check_call(['git', 'push', remote, f'{current_branch}:{remote_branch}']) # # Cleanup to prevent remotes and branches from piling up # - check_output(['git', 'branch', '--delete', current_branch]) - check_output(['git', 'remote', 'remove', remote]) - sys.exit(0) + subprocess.check_call(['git', 'checkout', 'develop']) + subprocess.check_call(['git', 'branch', '--delete', current_branch]) + subprocess.check_call(['git', 'remote', 'remove', remote]) + + +def hijack(prid): + url = f"https://api.github.com/repos/RaRe-Technologies/gensim/pulls/{prid}" + with smart_open.open(url) as fin: + prinfo = json.load(fin) + + user = prinfo['head']['user']['login'] + ssh_url = prinfo['head']['repo']['ssh_url'] -prid = int(sys.argv[1]) -url = f"https://api.github.com/repos/RaRe-Technologies/gensim/pulls/{prid}" -with smart_open.open(url) as fin: - prinfo = json.load(fin) + remotes = check_output(['git', 'remote']).split('\n') + if user not in remotes: + subprocess.check_call(['git', 'remote', 'add', user, ssh_url]) -user = prinfo['head']['user']['login'] -ssh_url = prinfo['head']['repo']['ssh_url'] + subprocess.check_call(['git', 'fetch', user]) -remotes = check_output(['git', 'remote']).split('\n') -if user not in remotes: - subprocess.check_call(['git', 'remote', 'add', user, ssh_url]) + ref = prinfo['head']['ref'] + subprocess.check_call(['git', 'checkout', f'{user}/{ref}']) + + # + # Prefix the local branch name with the user to avoid naming clashes with + # existing branches, e.g. develop + # + subprocess.check_call(['git', 'switch', '-c', f'{user}_{ref}']) + + # + # Set the upstream so we can push back to it more easily + # + subprocess.check_call(['git', 'branch', '--set-upstream-to', f'{user}/{ref}']) -subprocess.check_call(['git', 'fetch', user]) -ref = prinfo['head']['ref'] -subprocess.check_call(['git', 'checkout', f'{user}/{ref}']) +def main(): + if sys.argv[1] == "push": + push() + elif sys.argv[1] == 'merge-upstream-into': + prid = int(sys.argv[2]) + hijack(prid) + subprocess.check_call(['git', 'fetch', 'upstream']) + subprocess.check_call(['git', 'merge', 'upstream/develop', '--no-edit']) + push() + else: + prid = int(sys.argv[1]) + hijack(prid) -# -# Prefix the local branch name with the user to avoid naming clashes with -# existing branches, e.g. 
develop -# -subprocess.check_call(['git', 'switch', '-c', f'{user}_{ref}']) -# -# Set the upstream so we can push back to it more easily -# -subprocess.check_call(['git', 'branch', '--set-upstream-to', f'{user}/{ref}']) +if __name__ == '__main__': + main() diff --git a/release/upload_docs.sh b/release/upload_docs.sh index d454eaa157..3bec935a8e 100644 --- a/release/upload_docs.sh +++ b/release/upload_docs.sh @@ -1,3 +1,4 @@ -tox -e compile,docs +python setup.py build_ext --inplace cd docs/src +make html make upload diff --git a/requirements_docs.txt b/requirements_docs.txt index 3e41db0927..dc8b44b173 100644 --- a/requirements_docs.txt +++ b/requirements_docs.txt @@ -5,7 +5,7 @@ memory-profiler==0.55.0 nltk==3.4.5 nmslib==2.1.1 pandas==1.2.3 -pyemd==0.5.1 +POT==0.8.1 scikit-learn==0.24.1 sphinx-gallery==0.8.2 sphinxcontrib-napoleon==0.7 diff --git a/setup.py b/setup.py index e3ee0c3bdb..deace40c59 100644 --- a/setup.py +++ b/setup.py @@ -15,25 +15,26 @@ import platform import shutil import sys +from collections import OrderedDict from setuptools import Extension, find_packages, setup, distutils from setuptools.command.build_ext import build_ext -c_extensions = { - 'gensim.models.word2vec_inner': 'gensim/models/word2vec_inner.c', - 'gensim.corpora._mmreader': 'gensim/corpora/_mmreader.c', - 'gensim.models.fasttext_inner': 'gensim/models/fasttext_inner.c', - 'gensim._matutils': 'gensim/_matutils.c', - 'gensim.models.nmf_pgd': 'gensim/models/nmf_pgd.c', - 'gensim.similarities.fastss': 'gensim/similarities/fastss.c', -} +c_extensions = OrderedDict([ + ('gensim.models.word2vec_inner', 'gensim/models/word2vec_inner.c'), + ('gensim.corpora._mmreader', 'gensim/corpora/_mmreader.c'), + ('gensim.models.fasttext_inner', 'gensim/models/fasttext_inner.c'), + ('gensim._matutils', 'gensim/_matutils.c'), + ('gensim.models.nmf_pgd', 'gensim/models/nmf_pgd.c'), + ('gensim.similarities.fastss', 'gensim/similarities/fastss.c'), +]) -cpp_extensions = { - 'gensim.models.doc2vec_inner': 'gensim/models/doc2vec_inner.cpp', - 'gensim.models.word2vec_corpusfile': 'gensim/models/word2vec_corpusfile.cpp', - 'gensim.models.fasttext_corpusfile': 'gensim/models/fasttext_corpusfile.cpp', - 'gensim.models.doc2vec_corpusfile': 'gensim/models/doc2vec_corpusfile.cpp', -} +cpp_extensions = OrderedDict([ + ('gensim.models.doc2vec_inner', 'gensim/models/doc2vec_inner.cpp'), + ('gensim.models.word2vec_corpusfile', 'gensim/models/word2vec_corpusfile.cpp'), + ('gensim.models.fasttext_corpusfile', 'gensim/models/fasttext_corpusfile.cpp'), + ('gensim.models.doc2vec_corpusfile', 'gensim/models/doc2vec_corpusfile.cpp'), +]) def need_cython(): @@ -95,24 +96,32 @@ class CustomBuildExt(build_ext): """Custom build_ext action with bootstrapping. We need this in order to use numpy and Cython in this script without - importing them at module level, because they may not be available yet. + importing them at module level, because they may not be available at that time. 
""" - # - # http://stackoverflow.com/questions/19919905/how-to-bootstrap-numpy-installation-in-setup-py - # def finalize_options(self): build_ext.finalize_options(self) - # Prevent numpy from thinking it is still in its setup process: - # https://docs.python.org/2/library/__builtin__.html#module-__builtin__ - __builtins__.__NUMPY_SETUP__ = False + import builtins import numpy + + # + # Prevent numpy from thinking it is still in its setup process + # http://stackoverflow.com/questions/19919905/how-to-bootstrap-numpy-installation-in-setup-py + # + # Newer numpy versions don't support this hack, nor do they need it. + # https://github.com/pyvista/pyacvd/pull/23#issue-1298467701 + # + try: + builtins.__NUMPY_SETUP__ = False + except Exception as ex: + print(f'could not use __NUMPY_SETUP__ hack (numpy version: {numpy.__version__}): {ex}') + self.include_dirs.append(numpy.get_include()) if need_cython(): import Cython.Build - Cython.Build.cythonize(list(make_c_ext(use_cython=True))) - Cython.Build.cythonize(list(make_cpp_ext(use_cython=True))) + Cython.Build.cythonize(list(make_c_ext(use_cython=True)), language_level=3) + Cython.Build.cythonize(list(make_cpp_ext(use_cython=True)), language_level=3) class CleanExt(distutils.cmd.Command): @@ -272,15 +281,14 @@ def run(self): 'testfixtures', ] -if not (sys.platform.lower().startswith("win") and sys.version_info[:2] >= (3, 9)): +if not sys.platform.lower().startswith("win") and sys.version_info[:2] < (3, 11): core_testenv.extend([ - 'pyemd', + 'POT', 'nmslib', ]) # Add additional requirements for testing on Linux that are skipped on Windows. linux_testenv = core_testenv[:] + visdom_req - # Skip problematic/uninstallable packages (& thus related conditional tests) in Windows builds. # We still test them in Linux via Travis, see linux_testenv above. # See https://github.com/RaRe-Technologies/gensim/pull/2814 @@ -297,11 +305,17 @@ def run(self): # https://packaging.python.org/discussions/install-requires-vs-requirements/ # +# +# We pin the Sphinx-related packages to specific versions here because we want +# our documentation builds to be reproducible. Different versions of Sphinx +# can generate slightly different output, and because we keep some of the output +# under version control, we want to keep these differences to a minimum. +# docs_testenv = core_testenv + distributed_env + visdom_req + [ - 'sphinx', - 'sphinx-gallery', - 'sphinxcontrib.programoutput', - 'sphinxcontrib-napoleon', + 'sphinx==5.1.1', + 'sphinx-gallery==0.11.1', + 'sphinxcontrib.programoutput==0.17', + 'sphinxcontrib-napoleon==0.7', 'matplotlib', # expected by sphinx-gallery 'memory_profiler', 'annoy', @@ -312,18 +326,22 @@ def run(self): 'pandas', ] -NUMPY_STR = 'numpy >= 1.17.0' +NUMPY_STR = 'numpy >= 1.18.5' # # We pin the Cython version for reproducibility. We expect our extensions # to build with any sane version of Cython, so we should update this pin # periodically. 
# -CYTHON_STR = 'Cython==0.29.28' +CYTHON_STR = 'Cython==0.29.32' + +# Allow overriding the Cython version requirement +CYTHON_STR = os.environ.get('GENSIM_CYTHON_REQUIRES', CYTHON_STR) install_requires = [ NUMPY_STR, - 'scipy >= 0.18.1', + 'scipy >= 1.7.0', 'smart_open >= 1.8.1', + 'FuzzyTM >= 0.4.0' ] setup_requires = [NUMPY_STR] @@ -334,7 +352,7 @@ def run(self): setup( name='gensim', - version='4.2.0', + version='4.3.0', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION, @@ -367,9 +385,10 @@ def run(self): 'Environment :: Console', 'Intended Audience :: Science/Research', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3 :: Only', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Information Analysis', @@ -377,7 +396,7 @@ def run(self): ], test_suite="gensim.test", - python_requires='>=3.6', + python_requires='>=3.8', setup_requires=setup_requires, install_requires=install_requires, tests_require=linux_testenv,