# Skip to content

# heuristics to get title from section headers #3536

# heuristics to get title from section headers

# heuristics to get title from section headers #3536

# Workflow file for this run

# "Testing" workflow: triggered by pushes to main and by every pull request.
name: Testing

on:
  push:
    branches:
      - main
  pull_request:
    # Can't filter on paths easily and have required workflows.
    # See https://github.com/orgs/community/discussions/13690 and
    # https://engineering.mixpanel.com/enforcing-required-checks-on-conditional-ci-jobs-in-a-github-monorepo-8d4949694340

# Workflow-level environment: inherited by every job and step in this file.
env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  SYCAMORE_S3_TEMP_PATH: 's3://aryn-sycamore-integ-temp/'
  MODEL_SERVER_KEY: ${{ secrets.MODEL_SERVER_KEY }}
  PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
  # NOTE(review): populated from the same MODEL_SERVER_KEY secret as the
  # MODEL_SERVER_KEY variable above - confirm this aliasing is intentional.
  ARYN_API_KEY: ${{ secrets.MODEL_SERVER_KEY }}
  SYCAMORE_HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
  SYCAMORE_OPENAI_USER: 'sycamore-github-ci'
  # Tagged with the ref (branch / PR ref) that triggered the run.
  SYCAMORE_HELICONE_TAG: ${{ github.ref }}
  OS_USER_NAME: ${{ secrets.OS_USER_NAME }}
  OS_PASSWORD: ${{ secrets.OS_PASSWORD }}

# Permissions for AWS access
permissions:
  id-token: write  # This is required for requesting the JWT
  contents: read  # This is required for actions/checkout
jobs:
  # Unit tests for lib/sycamore, run once per Python version in the matrix.
  # The DF-n / du steps only print disk usage so that space consumption can be
  # followed stage by stage in the job log.
  sycamore-unit-tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
      - name: DF-1
        run: df
      - name: du-runner-initial
        run: du -kx /home/runner | sort -rn | head -20
      # Bind-mount a directory on /mnt over ~/.cache so cache data is written
      # to the /mnt filesystem instead of the home filesystem.
      - name: Move cache to /mnt
        run: >-
          sudo mkdir /mnt/cache &&
          sudo chown $(whoami) /mnt/cache &&
          mkdir -p /home/runner/.cache &&
          sudo mount -o bind /mnt/cache /home/runner/.cache
      # Could free up other stuff as in:
      # https://github.com/easimon/maximize-build-space/blob/master/action.yml
      - name: Free up disk space
        run: sudo rm -rf /usr/local/lib/android
      - name: DF-2
        run: df
      - name: Checkout
        uses: actions/checkout@v4
      # Poetry is installed before setup-python because `cache: "poetry"`
      # needs the poetry executable to be available.
      - name: Install poetry
        run: pipx install poetry
      # setup-python v5 targets the Node 20 action runtime (v4 targets the
      # deprecated Node 16); the inputs used here are unchanged.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: DF-3
        run: df
      - name: Install sycamore
        run: poetry install --all-extras
        working-directory: lib/sycamore
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
        working-directory: lib/sycamore
      - name: DF-4
        run: df
      - name: Update Apt
        run: sudo apt-get update
      - name: Install apt dependencies
        run: sudo apt-get install -y poppler-utils tesseract-ocr libreoffice
      - name: DF-5
        run: df
      - name: Run tests
        run: poetry run pytest -n auto sycamore/tests/unit/
        working-directory: lib/sycamore
      - name: Run more tests
        run: poetry run python sycamore/tests/manual/test_fast_sycamore_import.py
        working-directory: lib/sycamore
      - name: DF-6
        run: df

  # Unit tests for lib/remote-processors, run once per Python version.
  rps-unit-test:
    runs-on: blacksmith-4vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Install Poetry
        run: pipx install poetry
      - name: Set up Python ${{ matrix.python-version }}
        uses: useblacksmith/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install Protoc
        uses: arduino/setup-protoc@v3
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Install RPS
        run: make install_rps
        working-directory: lib/remote-processors
      - name: Run Tests
        run: poetry run pytest remote_processors/test/unit/
        working-directory: lib/remote-processors

  # Runs ./notebooks/run-notebook-tests.sh --fast with an OpenSearch service
  # container published on host port 9200.
  notebook-tests-fast:
    runs-on: blacksmith-8vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - "9200:9200"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      # setup-python v5 targets the Node 20 action runtime (v4 targets the
      # deprecated Node 16); the inputs used here are unchanged.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      # Fetches the legacy resource names plus the punkt_tab / *_eng names the
      # unit and integration jobs download, so the data is present whichever
      # set the installed nltk looks up.
      - name: Download nltk packages
        run: >-
          poetry run python -m nltk.downloader
          punkt punkt_tab
          averaged_perceptron_tagger averaged_perceptron_tagger_eng
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      # Uses the workflow-level `id-token: write` permission to request the JWT.
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
        run: ./notebooks/run-notebook-tests.sh --fast

  # Integration tests under lib/sycamore/sycamore/tests/integration, run
  # against OpenSearch, Elasticsearch, Neo4j and Qdrant service containers.
  integ-tests:
    runs-on: blacksmith-8vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - "9200:9200"
      elasticsearch:
        image: elasticsearch:8.14.2
        env:
          discovery.type: "single-node"
          xpack.security.http.ssl.enabled: "false"
          xpack.security.enabled: "false"
          bootstrap.memory_lock: "true"
          ES_JAVA_OPTS: "-Xms2g -Xmx2g"
        ports:
          # Host port 9201, since host port 9200 is taken by OpenSearch above.
          - "9201:9200"
      neo4j:
        image: neo4j:5.21.0
        env:
          # Values are quoted so the container receives literal strings
          # instead of YAML booleans.
          NEO4J_dbms_memory_heap_initial__size: 2G
          NEO4J_dbms_memory_heap_max__size: 2G
          NEO4J_dbms_security_auth__enabled: "false"
          NEO4J_apoc_export_file_enabled: "true"
          NEO4J_apoc_import_file_enabled: "true"
          NEO4J_apoc_import_file_use__neo4j__config: "true"
          NEO4JLABS_PLUGINS: '["apoc"]'
        # Host directory /neo4j/import is mounted as the container's import
        # directory; a later step makes it world-writable.
        options: >-
          --name neo4j
          --volume /neo4j/import:/var/lib/neo4j/import
        ports:
          - "7474:7474"
          - "7687:7687"
      qdrant:
        image: qdrant/qdrant:v1.11.4
        ports:
          - "6333:6333"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      # setup-python v5 targets the Node 20 action runtime (v4 targets the
      # deprecated Node 16); the inputs used here are unchanged.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      - name: Download nltk packages
        run: poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      # Uses the workflow-level `id-token: write` permission to request the JWT.
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Allow Read/Write permissions to Neo4j Import Directory
        run: sudo chmod -R 777 /neo4j/import
      - name: Run Integ tests
        run: poetry run pytest lib/sycamore/sycamore/tests/integration

  # Runs ./notebooks/run-notebook-tests.sh --slow with an OpenSearch service
  # container published on host port 9200.
  notebook-tests-slow:
    runs-on: blacksmith-8vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - "9200:9200"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      # setup-python v5 targets the Node 20 action runtime (v4 targets the
      # deprecated Node 16); the inputs used here are unchanged.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      # Fetches the legacy resource names plus the punkt_tab / *_eng names the
      # unit and integration jobs download, so the data is present whichever
      # set the installed nltk looks up.
      - name: Download nltk packages
        run: >-
          poetry run python -m nltk.downloader
          punkt punkt_tab
          averaged_perceptron_tagger averaged_perceptron_tagger_eng
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      # Uses the workflow-level `id-token: write` permission to request the JWT.
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run Notebook tests
        run: ./notebooks/run-notebook-tests.sh --slow

  # Runs ./notebooks/run-notebook-tests.sh --docprep with an OpenSearch service
  # container published on host port 9200.
  notebook-tests-docprep:
    runs-on: blacksmith-8vcpu-ubuntu-2204
    strategy:
      matrix:
        python-version: ["3.9"]
    services:
      opensearch:
        image: opensearchproject/opensearch:2.10.0
        env:
          discovery.type: "single-node"
        ports:
          - "9200:9200"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install poetry
        run: pipx install poetry
      # setup-python v5 targets the Node 20 action runtime (v4 targets the
      # deprecated Node 16); the inputs used here are unchanged.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"
      - name: Install sycamore
        run: poetry install --all-extras
      # Fetches the legacy resource names plus the punkt_tab / *_eng names the
      # unit and integration jobs download, so the data is present whichever
      # set the installed nltk looks up.
      - name: Download nltk packages
        run: >-
          poetry run python -m nltk.downloader
          punkt punkt_tab
          averaged_perceptron_tagger averaged_perceptron_tagger_eng
      - name: Update Apt
        run: sudo apt-get update
      - name: Install poppler and tesseract
        run: sudo apt-get install -y poppler-utils tesseract-ocr
      # Uses the workflow-level `id-token: write` permission to request the JWT.
      - name: Configure AWS Credentials via OIDC provider.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
      - name: Run DocPrep Notebook Tests
        run: ./notebooks/run-notebook-tests.sh --docprep