heuristics to get title from section headers #3536
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Testing

# Triggers: pushes to main, and every pull request.
on:
  push:
    branches:
      - main
  # Deliberately no `paths:` filter on pull requests: path filtering does not
  # combine easily with required workflows.
  # See https://github.com/orgs/community/discussions/13690 and
  # https://engineering.mixpanel.com/enforcing-required-checks-on-conditional-ci-jobs-in-a-github-monorepo-8d4949694340
  pull_request:
# Workflow-level environment, visible to every job below.
env:
  # Values read from repository secrets.
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
  MODEL_SERVER_KEY: ${{ secrets.MODEL_SERVER_KEY }}
  PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
  # NOTE(review): reads the same secret as MODEL_SERVER_KEY; confirm this is
  # intentional and not a copy/paste slip.
  ARYN_API_KEY: ${{ secrets.MODEL_SERVER_KEY }}
  SYCAMORE_HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
  OS_USER_NAME: ${{ secrets.OS_USER_NAME }}
  OS_PASSWORD: ${{ secrets.OS_PASSWORD }}
  # Literal values and values derived from the GitHub context.
  SYCAMORE_S3_TEMP_PATH: 's3://aryn-sycamore-integ-temp/'
  SYCAMORE_OPENAI_USER: 'sycamore-github-ci'
  SYCAMORE_HELICONE_TAG: ${{ github.ref }}
# Token permissions needed for AWS access and for checking out the repository.
permissions:
  # Required for requesting the JWT.
  id-token: write
  # Required for actions/checkout.
  contents: read
jobs: | |
# Unit tests for lib/sycamore, run once per Python version in the matrix.
# The DF-n steps print filesystem usage (`df`) at successive points in the job.
sycamore-unit-tests:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      # Versions are quoted so that "3.10" is not read as the float 3.1.
      python-version: ["3.9", "3.10", "3.11", "3.12"]
  steps:
    - name: DF-1
      run: df
    - name: du-runner-initial
      # List the 20 largest directories under /home/runner.
      run: du -kx /home/runner | sort -rn | head -20
    - name: Move cache to /mnt
      # Bind-mount /mnt/cache over ~/.cache so cache writes land on /mnt.
      # The default bash shell runs with -e, so any failing line stops the step
      # just as the former `&&` chain did; `mkdir -p` tolerates an existing dir.
      run: |
        sudo mkdir -p /mnt/cache
        sudo chown "$(whoami)" /mnt/cache
        mkdir -p /home/runner/.cache
        sudo mount -o bind /mnt/cache /home/runner/.cache
    # Could free up other stuff as in:
    # https://github.com/easimon/maximize-build-space/blob/master/action.yml
    - name: Free up disk space
      run: sudo rm -rf /usr/local/lib/android
    - name: DF-2
      run: df
    - name: Checkout
      uses: actions/checkout@v4
    # Poetry is installed before setup-python because `cache: "poetry"` is
    # requested below.
    - name: Install poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on the Node 20 action runtime; v4 targeted the deprecated Node 16.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: "poetry"
    - name: DF-3
      run: df
    - name: Install sycamore
      run: poetry install --all-extras
      working-directory: lib/sycamore
    - name: Download nltk packages
      run: poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
      working-directory: lib/sycamore
    - name: DF-4
      run: df
    - name: Update Apt
      run: sudo apt-get update
    - name: Install apt dependencies
      run: sudo apt-get install -y poppler-utils tesseract-ocr libreoffice
    - name: DF-5
      run: df
    - name: Run tests
      # `-n auto` spreads the unit tests across parallel pytest workers.
      run: poetry run pytest -n auto sycamore/tests/unit/
      working-directory: lib/sycamore
    - name: Run more tests
      # This script is run directly with python rather than through pytest.
      run: poetry run python sycamore/tests/manual/test_fast_sycamore_import.py
      working-directory: lib/sycamore
    - name: DF-6
      run: df
# Unit tests for lib/remote-processors, run once per Python version in the matrix.
rps-unit-test:
  runs-on: blacksmith-4vcpu-ubuntu-2204
  strategy:
    matrix:
      python-version:
        - "3.9"
        - "3.10"
        - "3.11"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        # Also check out git submodules, recursively.
        submodules: recursive
    - name: Install Poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      uses: useblacksmith/setup-python@v6
      with:
        cache: "poetry"
        python-version: ${{ matrix.python-version }}
    - name: Install Protoc
      uses: arduino/setup-protoc@v3
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
    # Both remaining steps run from inside lib/remote-processors.
    - name: Install RPS
      working-directory: lib/remote-processors
      run: make install_rps
    - name: Run Tests
      working-directory: lib/remote-processors
      run: poetry run pytest remote_processors/test/unit/
# Runs ./notebooks/run-notebook-tests.sh --fast with an OpenSearch 2.10.0
# service container published on port 9200.
notebook-tests-fast:
  runs-on: blacksmith-8vcpu-ubuntu-2204
  strategy:
    matrix:
      python-version: ["3.9"]
  services:
    opensearch:
      image: opensearchproject/opensearch:2.10.0
      env:
        discovery.type: "single-node"
      ports:
        - "9200:9200"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Install poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on the Node 20 action runtime; v4 targeted the deprecated Node 16.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: "poetry"
    - name: Install sycamore
      run: poetry install --all-extras
    - name: Download nltk packages
      # Fetch both the legacy resource names (punkt, averaged_perceptron_tagger)
      # and their replacements (punkt_tab, averaged_perceptron_tagger_eng), so the
      # data is present whichever names the installed NLTK release looks up.
      run: >-
        poetry run python -m nltk.downloader
        punkt punkt_tab
        averaged_perceptron_tagger averaged_perceptron_tagger_eng
    - name: Update Apt
      run: sudo apt-get update
    - name: Install poppler and tesseract
      run: sudo apt-get install -y poppler-utils tesseract-ocr
    - name: Configure AWS Credentials via OIDC provider.
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-east-1
        role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
    - name: Run Notebook tests
      run: ./notebooks/run-notebook-tests.sh --fast
# Integration tests under lib/sycamore/sycamore/tests/integration, run with
# OpenSearch, Elasticsearch, Neo4j and Qdrant service containers.
integ-tests:
  runs-on: blacksmith-8vcpu-ubuntu-2204
  strategy:
    matrix:
      python-version: ["3.9"]
  services:
    opensearch:
      image: opensearchproject/opensearch:2.10.0
      env:
        discovery.type: "single-node"
      ports:
        - "9200:9200"
    elasticsearch:
      image: elasticsearch:8.14.2
      env:
        discovery.type: "single-node"
        xpack.security.http.ssl.enabled: "false"
        xpack.security.enabled: "false"
        bootstrap.memory_lock: "true"
        ES_JAVA_OPTS: "-Xms2g -Xmx2g"
      ports:
        # Host port 9201, because 9200 is already taken by OpenSearch above.
        - "9201:9200"
    neo4j:
      image: neo4j:5.21.0
      env:
        # Environment values are strings; the boolean-looking ones are quoted so
        # no YAML loader can retype them.
        NEO4J_dbms_memory_heap_initial__size: "2G"
        NEO4J_dbms_memory_heap_max__size: "2G"
        NEO4J_dbms_security_auth__enabled: "false"
        NEO4J_apoc_export_file_enabled: "true"
        NEO4J_apoc_import_file_enabled: "true"
        NEO4J_apoc_import_file_use__neo4j__config: "true"
        NEO4JLABS_PLUGINS: '["apoc"]'
      # Folded into a single line of container-creation options: the container
      # is named "neo4j" and the host directory /neo4j/import is mounted as the
      # container's import directory.
      options: >-
        --name neo4j
        --volume /neo4j/import:/var/lib/neo4j/import
      ports:
        - "7474:7474"
        - "7687:7687"
    qdrant:
      image: qdrant/qdrant:v1.11.4
      ports:
        - "6333:6333"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Install poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on the Node 20 action runtime; v4 targeted the deprecated Node 16.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: "poetry"
    - name: Install sycamore
      run: poetry install --all-extras
    - name: Download nltk packages
      run: poetry run python -m nltk.downloader punkt_tab averaged_perceptron_tagger_eng
    - name: Update Apt
      run: sudo apt-get update
    - name: Install poppler and tesseract
      run: sudo apt-get install -y poppler-utils tesseract-ocr
    - name: Configure AWS Credentials via OIDC provider.
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-east-1
        role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
    - name: Allow Read/Write permissions to Neo4j Import Directory
      # /neo4j/import is the host side of the volume mounted into the neo4j
      # service container above.
      run: sudo chmod -R 777 /neo4j/import
    - name: Run Integ tests
      run: poetry run pytest lib/sycamore/sycamore/tests/integration
# Runs ./notebooks/run-notebook-tests.sh --slow with an OpenSearch 2.10.0
# service container published on port 9200.
notebook-tests-slow:
  runs-on: blacksmith-8vcpu-ubuntu-2204
  strategy:
    matrix:
      python-version: ["3.9"]
  services:
    opensearch:
      image: opensearchproject/opensearch:2.10.0
      env:
        discovery.type: "single-node"
      ports:
        - "9200:9200"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Install poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on the Node 20 action runtime; v4 targeted the deprecated Node 16.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: "poetry"
    - name: Install sycamore
      run: poetry install --all-extras
    - name: Download nltk packages
      # Fetch both the legacy resource names (punkt, averaged_perceptron_tagger)
      # and their replacements (punkt_tab, averaged_perceptron_tagger_eng), so the
      # data is present whichever names the installed NLTK release looks up.
      run: >-
        poetry run python -m nltk.downloader
        punkt punkt_tab
        averaged_perceptron_tagger averaged_perceptron_tagger_eng
    - name: Update Apt
      run: sudo apt-get update
    - name: Install poppler and tesseract
      run: sudo apt-get install -y poppler-utils tesseract-ocr
    - name: Configure AWS Credentials via OIDC provider.
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-east-1
        role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
    - name: Run Notebook tests
      run: ./notebooks/run-notebook-tests.sh --slow
# Runs ./notebooks/run-notebook-tests.sh --docprep with an OpenSearch 2.10.0
# service container published on port 9200.
notebook-tests-docprep:
  runs-on: blacksmith-8vcpu-ubuntu-2204
  strategy:
    matrix:
      python-version: ["3.9"]
  services:
    opensearch:
      image: opensearchproject/opensearch:2.10.0
      env:
        discovery.type: "single-node"
      ports:
        - "9200:9200"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Install poetry
      run: pipx install poetry
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on the Node 20 action runtime; v4 targeted the deprecated Node 16.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
        cache: "poetry"
    - name: Install sycamore
      run: poetry install --all-extras
    - name: Download nltk packages
      # Fetch both the legacy resource names (punkt, averaged_perceptron_tagger)
      # and their replacements (punkt_tab, averaged_perceptron_tagger_eng), so the
      # data is present whichever names the installed NLTK release looks up.
      run: >-
        poetry run python -m nltk.downloader
        punkt punkt_tab
        averaged_perceptron_tagger averaged_perceptron_tagger_eng
    - name: Update Apt
      run: sudo apt-get update
    - name: Install poppler and tesseract
      run: sudo apt-get install -y poppler-utils tesseract-ocr
    - name: Configure AWS Credentials via OIDC provider.
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-east-1
        role-to-assume: arn:aws:iam::237550789389:role/aryn-github-integ
    - name: Run DocPrep Notebook Tests
      run: ./notebooks/run-notebook-tests.sh --docprep