-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
3,784 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash
# Downloads the Pango alias key and lineage list, then derives
# lineage_definitions.yaml from them with the two Python helper scripts.

# Abort on the first failing command, on unset variables and on pipeline errors,
# so that a failed download never feeds a partial or bogus file into later steps.
set -euo pipefail

# --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
curl --fail --silent --show-error --location \
  https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master/pango_designation/alias_key.json \
  --output alias_key.json
curl --fail --silent --show-error --location \
  https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master/lineages.csv \
  --output lineages.csv

python3 uniqueValuesFromColumn.py lineages.csv 1 lineage_keys.csv
python3 alias2lineageDefinitions.py alias_key.json lineage_keys.csv > lineage_definitions.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Generate Lineage Definitions | ||
|
||
## Overview | ||
|
||
**Generate Lineage Definitions** is a TypeScript-based utility for processing JSON alias keys and CSV lineage files to transform them into a structured format using DuckDB. It reads an alias key in JSON format, a lineage file in CSV format, and outputs a structured YAML file, representing lineages and their relationships (parents and aliases). | ||
|
||
## Installation | ||
|
||
### Prerequisites | ||
|
||
Make sure you have **Node.js** installed. | ||
|
||
### Steps | ||
|
||
1. Clone the repository: | ||
```bash | ||
git clone https://github.com/GenSpectrum/LAPIS-SILO | ||
``` | ||
|
||
2. Navigate to the project directory: | ||
```bash | ||
cd LAPIS-SILO/scripts/lineage-definition-generator | ||
``` | ||
|
||
3. Install the required dependencies: | ||
```bash | ||
npm install | ||
``` | ||
|
||
## Usage | ||
|
||
To run the transformation script, use the following command: | ||
|
||
```bash | ||
npm run start -- <aliasKey> <lineageFile> [options] | ||
``` | ||
|
||
### Arguments | ||
|
||
* `<aliasKey>`: The path to the JSON file containing alias key mappings. | ||
* `<lineageFile>`: The path to the CSV file containing all lineages. | ||
|
||
### Options | ||
|
||
* `--preserve-tmp-dir`: Preserve the temporary directory where intermediate DuckDB files are stored. By default, the directory is deleted after execution. | ||
* `--verbose, -v`: Enable verbose logging for more detailed output during the execution. | ||
|
||
### Output | ||
|
||
The resulting lineage structure is printed in YAML format: | ||
|
||
|
||
``` | ||
parent1: | ||
aliases: | ||
- some_alias | ||
parent2: {} | ||
lineage1: | ||
parents: | ||
- parent1 | ||
- parent2 | ||
aliases: | ||
- alias1 | ||
``` | ||
|
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import * as fs from "fs"; | ||
import csvParser from "csv-parser"; | ||
import * as path from "path"; | ||
import * as duckdb from "duckdb"; | ||
import * as yaml from "js-yaml"; | ||
import { tmpdir } from "os"; | ||
import { Command } from "commander"; | ||
import { RowData } from "duckdb"; | ||
import * as arrow from "apache-arrow"; | ||
|
||
const program = new Command();

// Command-line interface: two required positional arguments plus two optional flags.
program
  .argument("<aliasKey>", "Path to the alias key in JSON format")
  .argument("<lineageFile>", "Path to the input file containing all lineages")
  .option(
    "--preserve-tmp-dir",
    "Preserve the temporary directory to keep the intermediate duckdb tables",
  )
  // The usage documentation lists "-v" as a short form of "--verbose", so register both.
  .option("-v, --verbose", "Verbose logging")
  .parse(process.argv);

// Parsed flags (preserveTmpDir, verbose) and the two positional arguments in declaration order.
const options = program.opts();
const aliasKeyPath = program.args[0];
const lineageFilePath = program.args[1];
|
||
let aliasTable: any[] = [];
let aliasDict: { [key: string]: string[] } = {};

// Load the alias key file and index it.
// - String values produce one aliasTable row and a single-element aliasDict entry;
//   an empty string means the name stands for itself.
// - Array values are stored in aliasDict only.
// - Values of any other type are ignored.
const aliasFile = JSON.parse(fs.readFileSync(aliasKeyPath, "utf-8"));

Object.keys(aliasFile).forEach((name) => {
  const target = aliasFile[name];

  if (Array.isArray(target)) {
    aliasDict[name] = target as string[];
    return;
  }
  if (typeof target !== "string") {
    return;
  }

  const alias = target.length > 0 ? target : name;
  aliasTable.push({ name: name, alias: alias });
  aliasDict[name] = [alias];
});
|
||
/**
 * Replaces the first dot-separated segment of a lineage name with its alias expansion.
 *
 * The substitution only happens when `aliasDict` holds exactly one expansion for that
 * segment; otherwise (no entry, or several expansions) the name is returned unchanged.
 *
 * @param lineage Lineage name, possibly starting with an alias.
 * @returns The lineage name with its leading segment expanded, or the input itself.
 */
function unaliasLineage(lineage: string): string {
  const [head, ...rest] = lineage.split(".");
  const expansions = aliasDict[head];

  if (!expansions || expansions.length !== 1) {
    return lineage;
  }
  return [expansions[0], ...rest].join(".");
}
|
||
/**
 * Derives the direct parent of a dot-separated lineage name by dropping its last segment.
 *
 * @param lineage Lineage name such as "B.1.1".
 * @returns Everything before the last ".", or `null` when the name contains no ".".
 */
function findImmediateParent(lineage: string): string | null {
  const lastDot = lineage.lastIndexOf(".");
  return lastDot === -1 ? null : lineage.substring(0, lastDot);
}
|
||
let lineageTable: any[] = [];
let allUnaliased = new Set<string>();

// Read the lineage CSV, derive the unaliased name and immediate parent of every lineage,
// then load the resulting rows into a temporary DuckDB database.
fs.createReadStream(lineageFilePath)
  .pipe(csvParser())
  .on("data", (row: any) => {
    // The lineage name is taken from the first column, whatever its header is.
    const lineage = row[Object.keys(row)[0]];
    const unaliased = unaliasLineage(lineage);
    allUnaliased.add(unaliased);
    const parentLineageUnaliased = findImmediateParent(unaliased);
    lineageTable.push({
      lineage,
      unaliased,
      parentLineageUnaliased,
    });
  })
  .on("end", () => {
    // Fill "gaps" in lineage system: e.g. generate BA if it is missing and BA.something is present.
    // lineageTable grows while it is scanned, so newly added parents are themselves
    // checked for missing ancestors later in the same loop.
    for (let idx = 0; idx < lineageTable.length; idx++) {
      const unaliased = lineageTable[idx].parentLineageUnaliased;
      if (unaliased && !allUnaliased.has(unaliased)) {
        allUnaliased.add(unaliased);
        const parentLineageUnaliased = findImmediateParent(unaliased);
        lineageTable.push({
          lineage: unaliased,
          unaliased,
          parentLineageUnaliased,
        });
      }
    }

    // NOTE(review): nothing in this block removes the temporary directory again,
    // regardless of the --preserve-tmp-dir option.
    const tempDirPrefix = path.join(tmpdir(), "silo-lineage-definitions-");
    const tempDir = fs.mkdtempSync(tempDirPrefix);
    if (options.verbose) {
      console.error(`Temporary directory: ${tempDir}`);
    }

    const dbPath = path.join(tempDir, "lineage_transform.duckdb");
    const db = new duckdb.Database(dbPath);
    const con = db.connect();

    db.exec(`INSTALL arrow; LOAD arrow;`, (err) => {
      if (err) {
        console.warn(err);
        return;
      }

      const arrowTable = arrow.tableFromJSON(lineageTable);
      // Register the buffer on the same connection that runs the query below, so the
      // registered relation is guaranteed to be visible to that query.
      con.register_buffer(
        "lineageTable",
        [arrow.tableToIPC(arrowTable)],
        true,
        (err) => {
          if (err) {
            console.warn(err);
            return;
          }

          let lineageDict: any = {};
          con.each(
            // Query the relation registered above; no relation named "lineages" is created here.
            "SELECT lineage FROM lineageTable ORDER BY lineage",
            [],
            (err, row: RowData) => {
              // On a failed query there is no row, so bail out before touching it.
              if (err) {
                console.warn(err);
                return;
              }
              console.log(row);
              lineageDict[row.lineage] = {};
            },
          );
        },
      );
    });
  });
Oops, something went wrong.