Skip to content

Commit

Permalink
Merge pull request #215 from GenSpectrum/ndjson-preprocessing
Browse files Browse the repository at this point in the history
Redesigned preprocessing
  • Loading branch information
Taepper authored Dec 18, 2023
2 parents a795602 + 619035b commit 14ed389
Show file tree
Hide file tree
Showing 120 changed files with 4,032 additions and 2,179 deletions.
41 changes: 23 additions & 18 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,49 +105,54 @@ jobs:
endToEndTests:
name: Run End To End Tests
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' }}
strategy:
matrix:
preprocessing-docker-compose: [
docker-compose-for-tests-preprocessing-from-tsv.yml,
docker-compose-for-tests-preprocessing-from-ndjson.yml
]
steps:
-
uses: actions/checkout@v4
-
name: Use Node.js ${{ matrix.node-version }}
- uses: actions/checkout@v4

- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v4
with:
node-version: 18.x
-
uses: actions/cache@v3

- uses: actions/cache@v3
with:
path: ~/.npm
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
-
name: npm install

- name: npm install
run: cd endToEndTests && npm ci
-
name: Check Format

- name: Check Format
run: cd endToEndTests && npm run check-format
-
name: Docker Metadata

- name: Docker Metadata
id: dockerMetadata
uses: docker/metadata-action@v5
with:
images: ${{ env.DOCKER_IMAGE_NAME }}
tags: type=ref,event=branch
-
name: Wait for Docker Image

- name: Wait for Docker Image
uses: lewagon/wait-on-check-action@v1.3.1
with:
ref: ${{ github.ref }}
check-name: Build And Run Unit Tests
repo-token: ${{ secrets.GITHUB_TOKEN }}

- name: Start Docker Container and preprocess data
run: docker compose -f docker-compose-for-tests-preprocessing.yml up
run: docker compose -f ${{ matrix.preprocessing-docker-compose }} up
env:
SILO_IMAGE: ${{ steps.dockerMetadata.outputs.tags }}

- name: Start Docker Container and run api
run: docker compose -f docker-compose-for-tests-api.yml up -d --wait
env:
SILO_IMAGE: ${{ steps.dockerMetadata.outputs.tags }}
-
name: Run Tests

- name: Run Tests
run: cd endToEndTests && SILO_URL=localhost:8080 npm run test
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
# ---------------------------------------------------------------------------

find_package(Boost REQUIRED COMPONENTS system serialization iostreams)
find_package(Poco REQUIRED COMPONENTS Net Util JSON)
find_package(duckdb REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(TBB REQUIRED)
find_package(nlohmann_json REQUIRED)
find_package(Poco REQUIRED COMPONENTS Net Util JSON)
find_package(roaring REQUIRED)
find_package(spdlog REQUIRED)
find_package(vincentlaucsb-csv-parser REQUIRED)
find_package(TBB REQUIRED)
find_package(yaml-cpp REQUIRED)
find_package(zstd REQUIRED)

Expand Down Expand Up @@ -81,12 +81,12 @@ target_link_libraries(
silo
PUBLIC
${Boost_LIBRARIES}
TBB::tbb
${duckdb_LIBRARIES}
nlohmann_json::nlohmann_json
${roaring_LIBRARIES}
${spdlog_LIBRARIES}
${vincentlaucsb-csv-parser_LIBRARIES}
TBB::tbb
${yaml-cpp_LIBRARIES}
nlohmann_json::nlohmann_json
zstd::libzstd_static
)

Expand Down
16 changes: 8 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
FROM alpine:3.17.0 AS dep_builder
FROM alpine:3.18 AS dep_builder

RUN apk update && apk add --no-cache py3-pip \
build-base=0.5-r3 \
cmake=3.24.4-r0 \
linux-headers=5.19.5-r0 \
boost-build=1.79.0-r0 \
libtbb=2021.7.0-r0
cmake=3.26.5-r0 \
linux-headers=6.3-r0 \
boost-build=1.82.0-r0 \
libtbb=2021.9.0-r0

RUN pip install conan==2.0.8
RUN pip install conan==2.0.14

WORKDIR /src
COPY conanfile.py conanprofile.docker ./
Expand All @@ -32,14 +32,14 @@ RUN \
&& cp build/siloApi .


FROM alpine:3.17.0 AS server
FROM alpine:3.18 AS server

WORKDIR /app
COPY docker_default_preprocessing_config.yaml ./default_preprocessing_config.yaml
COPY docker_runtime_config.yaml ./runtime_config.yaml
COPY --from=builder /src/siloApi ./

RUN apk update && apk add libtbb=2021.7.0-r0 curl jq
RUN apk update && apk add libtbb=2021.9.0-r0 curl jq

# call /info, extract "sequenceCount" from the JSON and assert that the value is not 0. If any of those fails, "exit 1".
HEALTHCHECK --start-period=20s CMD curl --fail --silent localhost:8081/info | jq .sequenceCount | xargs test 0 -ne || exit 1
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile_linter
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ WORKDIR /src
RUN apt update \
&& apt install -y \
cmake=3.22.1-1ubuntu1.22.04.1 \
python3-pip=22.0.2+dfsg-1ubuntu0.3 \
python3-pip=22.0.2+dfsg-1ubuntu0.4 \
software-properties-common=0.99.22.7 \
wget=1.21.2-2ubuntu1 \
gnupg=2.2.27-3ubuntu2.1 \
Expand All @@ -14,7 +14,7 @@ RUN apt update \
&& add-apt-repository 'deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy main' \
&& apt install -y clang-tidy

RUN pip install conan==2.0.8
RUN pip install conan==2.0.11

COPY conanfile.py conanprofile.docker ./
RUN mv conanprofile.docker conanprofile
Expand Down
2 changes: 1 addition & 1 deletion build_with_conan.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def main(args):
parser.add_argument("--clean", action="store_true", help="Clean build directory before building")
parser.add_argument("--release", action="store_true", help="Trigger RELEASE build")
parser.add_argument("--build_without_clang_tidy", action="store_true", help="Build without clang-tidy")
parser.add_argument("--parallel", type=int, default=1, help="Number of parallel jobs")
parser.add_argument("--parallel", type=int, default=16, help="Number of parallel jobs")

args_parsed = parser.parse_args()
main(args_parsed)
20 changes: 14 additions & 6 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ class SiloRecipe(ConanFile):

requires = [
"boost/1.82.0",
"duckdb/0.8.1",
"poco/1.12.4",
"hwloc/2.9.3",
"onetbb/2021.9.0",
"nlohmann_json/3.11.2",
"gtest/cci.20210126",
"roaring/1.0.0",
"spdlog/1.11.0",
"vincentlaucsb-csv-parser/2.1.3",
"yaml-cpp/0.7.0",
"zstd/1.5.5",
]
Expand All @@ -23,6 +24,10 @@ class SiloRecipe(ConanFile):

"zstd/*:shared": False,

"duckdb/*:shared": False,
"duckdb/*:with_json": True,
"duckdb/*:with_parquet": True,

"roaring/*:shared": False,

"gtest/*:no_main": True,
Expand All @@ -31,6 +36,8 @@ class SiloRecipe(ConanFile):
"boost/*:zstd": True,
"boost/*:shared": False,

"hwloc/*:shared": False,

"boost/*:without_iostreams": False,
"boost/*:without_serialization": False,
"boost/*:without_system": False,
Expand Down Expand Up @@ -88,15 +95,16 @@ class SiloRecipe(ConanFile):
def generate(self):
deps = CMakeDeps(self)
deps.set_property("boost", "cmake_find_mode", "both")
deps.set_property("onetbb", "cmake_find_mode", "both")
deps.set_property("poco", "cmake_find_mode", "both")
deps.set_property("nlohmann_json", "cmake_find_mode", "both")
deps.set_property("duckdb", "cmake_find_mode", "both")
deps.set_property("fmt", "cmake_find_mode", "both")
deps.set_property("gtest", "cmake_find_mode", "both")
deps.set_property("hwloc", "cmake_find_mode", "both")
deps.set_property("nlohmann_json", "cmake_find_mode", "both")
deps.set_property("onetbb", "cmake_find_mode", "both")
deps.set_property("pcre2", "cmake_find_mode", "both")
deps.set_property("poco", "cmake_find_mode", "both")
deps.set_property("roaring", "cmake_find_mode", "both")
deps.set_property("spdlog", "cmake_find_mode", "both")
deps.set_property("fmt", "cmake_find_mode", "both")
deps.set_property("vincentlaucsb-csv-parser", "cmake_find_mode", "both")
deps.set_property("yaml-cpp", "cmake_find_mode", "both")
deps.set_property("zstd", "cmake_find_mode", "both")
deps.generate()
12 changes: 12 additions & 0 deletions docker-compose-for-tests-preprocessing-from-ndjson.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: "3.9"
services:
silo:
image: ${SILO_IMAGE}
volumes:
- ./testBaseData/exampleDatasetAsNdjson:/preprocessing/input
- ./testBaseData/output:/preprocessing/output
- ./testBaseData/exampleDatasetAsNdjson/preprocessing_config.yaml:/app/preprocessing_config.yaml
- ./testBaseData/exampleDatasetAsNdjson/database_config.yaml:/app/database_config.yaml
- ./logs:/app/logs
command:
- "--preprocessing"
File renamed without changes.
46 changes: 23 additions & 23 deletions endToEndTests/test/info.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ describe('The /info endpoint', () => {
.expect(200)
.expect('Content-Type', 'application/json')
.expect(headerToHaveDataVersion)
.expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60054981 })
.expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 26589432 })
.end(done);
});

Expand All @@ -27,15 +27,15 @@ describe('The /info endpoint', () => {
'bitmapContainerSizeStatistic'
);
expect(returnedInfo.bitmapContainerSizePerGenomeSection.bitmapContainerSizeStatistic).to.deep.equal({
numberOfArrayContainers: 43540,
numberOfArrayContainers: 48524,
numberOfBitsetContainers: 0,
numberOfRunContainers: 83,
numberOfValuesStoredInArrayContainers: 59577,
numberOfRunContainers: 284,
numberOfValuesStoredInArrayContainers: 66620,
numberOfValuesStoredInBitsetContainers: 0,
numberOfValuesStoredInRunContainers: 2354,
totalBitmapSizeArrayContainers: 119154,
numberOfValuesStoredInRunContainers: 2875,
totalBitmapSizeArrayContainers: 133240,
totalBitmapSizeBitsetContainers: 0,
totalBitmapSizeRunContainers: 3170,
totalBitmapSizeRunContainers: 4824,
});

expect(returnedInfo.bitmapContainerSizePerGenomeSection).to.have.property(
Expand All @@ -62,22 +62,22 @@ describe('The /info endpoint', () => {

expect(returnedInfo).to.have.property('bitmapSizePerSymbol');
expect(returnedInfo.bitmapSizePerSymbol).to.deep.equal({
'-': 6003470,
'A': 6112653,
'B': 5980600,
'C': 6064589,
'D': 5980600,
'G': 6067672,
'H': 5980600,
'K': 5980630,
'M': 5980620,
'N': 5980600,
'R': 5980620,
'S': 5980600,
'T': 6125253,
'V': 5980600,
'W': 5980600,
'Y': 5980620,
'-': 2661831,
'A': 2775910,
'B': 2631464,
'C': 2725728,
'D': 2631464,
'G': 2728118,
'H': 2631464,
'K': 2631594,
'M': 2631554,
'N': 2631464,
'R': 2631514,
'S': 2631464,
'T': 2791923,
'V': 2631464,
'W': 2631514,
'Y': 2631494,
});
})
.expect(headerToHaveDataVersion)
Expand Down
13 changes: 7 additions & 6 deletions endToEndTests/test/queries/fastaAligned_multiple.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"query": {
"action": {
"type": "FastaAligned",
"sequenceName": ["testSecondSequence", "S"]
"sequenceName": ["testSecondSequence", "S"],
"orderByFields": ["gisaid_epi_isl"]
},
"filterExpression": {
"type": "IntBetween",
Expand All @@ -18,16 +19,16 @@
"gisaid_epi_isl": "EPI_ISL_1408408",
"testSecondSequence": "ACGT"
},
{
"S": "MFVFLVLLPLVSSQCVNLITRTQ---SYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLDVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLGRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFDEVFNATRFASVYAWNRKRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKXXXXXXXXXXXXXXNKLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGNKPCNGVAGFNCYFPLRSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNHNAQALNTLVKQLSSKFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*",
"gisaid_epi_isl": "EPI_ISL_1749899",
"testSecondSequence": "AAGN"
},
{

"gisaid_epi_isl": "EPI_ISL_1749892",
"testSecondSequence": "ACGT"
},
{
"S": "MFVFLVLLPLVSSQCVNLITRTQ---SYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLDVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLGRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFDEVFNATRFASVYAWNRKRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKXXXXXXXXXXXXXXNKLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGNKPCNGVAGFNCYFPLRSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNHNAQALNTLVKQLSSKFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*",
"gisaid_epi_isl": "EPI_ISL_1749899",
"testSecondSequence": "AAGN"
},
{
"S": "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAI--SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGV-YHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTYGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIDDTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSHRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPINFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILARLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTHNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT*",
"gisaid_epi_isl": "EPI_ISL_2016901",
Expand Down
29 changes: 15 additions & 14 deletions endToEndTests/test/queries/nOf_2of3_details.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"testCaseName": "N-Of query requesting 2 of 3 mutations with details action",
"query": {
"action": {
"type": "Details"
"type": "Details",
"orderByFields": ["gisaid_epi_isl"]
},
"filterExpression": {
"type": "N-Of",
Expand All @@ -28,6 +29,19 @@
}
},
"expectedQueryResult": [
{
"aaInsertions": null,
"age": 50,
"country": "Switzerland",
"date": "2020-11-13",
"division": "Solothurn",
"gisaid_epi_isl": "EPI_ISL_1005148",
"insertions": "25701:CCC",
"pango_lineage": "B.1.221",
"qc_value": 0.92,
"region": "Europe",
"unsorted_date": "2020-12-17"
},
{
"aaInsertions": null,
"age": 50,
Expand Down Expand Up @@ -66,19 +80,6 @@
"qc_value": 0.9,
"region": "Europe",
"unsorted_date": "2021-01-22"
},
{
"aaInsertions": null,
"age": 50,
"country": "Switzerland",
"date": "2020-11-13",
"division": "Solothurn",
"gisaid_epi_isl": "EPI_ISL_1005148",
"insertions": "25701:CCC",
"pango_lineage": "B.1.221",
"qc_value": 0.92,
"region": "Europe",
"unsorted_date": "2020-12-17"
}
]
}
Loading

0 comments on commit 14ed389

Please sign in to comment.