Skip to content

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Oct 9, 2024
1 parent b9a24c7 commit 8222885
Show file tree
Hide file tree
Showing 7 changed files with 3,784 additions and 32 deletions.
5 changes: 5 additions & 0 deletions scripts/generate_new_lineage_definitions.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Downloads the current Pango alias key and lineage list and derives
# lineage_definitions.yaml from them.
#
# NOTE(review): the two Python helpers are referenced by relative path, so this
# presumably has to be run from the directory that contains them - confirm.

# Abort on the first failing command, on unset variables and on failures inside
# pipelines, so a broken download never reaches the generator.
set -euo pipefail

PANGO_BASE_URL="https://raw.githubusercontent.com/cov-lineages/pango-designation/refs/heads/master"

# --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
curl --fail --silent --show-error --location "${PANGO_BASE_URL}/pango_designation/alias_key.json" --output alias_key.json
curl --fail --silent --show-error --location "${PANGO_BASE_URL}/lineages.csv" --output lineages.csv

# Extract the distinct values of column 1 of lineages.csv into lineage_keys.csv.
python3 uniqueValuesFromColumn.py lineages.csv 1 lineage_keys.csv
python3 alias2lineageDefinitions.py alias_key.json lineage_keys.csv > lineage_definitions.yaml
67 changes: 67 additions & 0 deletions scripts/lineage-definition-generator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Generate Lineage Definitions

## Overview

**Generate Lineage Definitions** is a TypeScript-based utility for processing JSON alias keys and CSV lineage files to transform them into a structured format using DuckDB. It reads an alias key in JSON format, a lineage file in CSV format, and outputs a structured YAML file, representing lineages and their relationships (parents and aliases).

## Installation

### Prerequisites

Make sure you have **Node.js** installed.

### Steps

1. Clone the repository:
```bash
git clone https://github.com/GenSpectrum/LAPIS-SILO
```

2. Navigate to the project directory:
```bash
cd scripts/lineage-definition-generator
```

3. Install the required dependencies:
```bash
npm install
```

## Usage

To run the transformation script, use the following command:

```bash
npm run start -- <aliasKey> <lineageFile> [options]
```

### Arguments

* `<aliasKey>`: The path to the JSON file containing alias key mappings.
* `<lineageFile>`: The path to the CSV file containing all lineages.

### Options

* `--preserve-tmp-dir`: Preserve the temporary directory where intermediate DuckDB files are stored. By default, the directory is deleted after execution.
* `--verbose, -v`: Enable verbose logging for more detailed output during the execution.

### Output

The resulting lineage structure is printed in YAML format:


```yaml
parent1:
  aliases:
    - some_alias
parent2: {}
lineage1:
  parents:
    - parent1
    - parent2
  aliases:
    - alias1
```

File renamed without changes.
138 changes: 138 additions & 0 deletions scripts/lineage-definition-generator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import * as fs from "fs";
import csvParser from "csv-parser";
import * as path from "path";
import * as duckdb from "duckdb";
import * as yaml from "js-yaml";
import { tmpdir } from "os";
import { Command } from "commander";
import { RowData } from "duckdb";
import * as arrow from "apache-arrow";

// Command-line interface: two positional file paths plus optional flags.
const program = new Command();

program
  .argument("<aliasKey>", "Path to the alias key in JSON format")
  .argument("<lineageFile>", "Path to the input file containing all lineages")
  .option(
    "--preserve-tmp-dir",
    "Preserve the temporary directory to keep the intermediate duckdb tables",
  )
  // The README documents `-v` as a short form, so register it next to `--verbose`.
  .option("-v, --verbose", "Verbose logging")
  .parse(process.argv);

// Parsed flags; commander exposes them in camelCase (e.g. `verbose`).
const options = program.opts();
// Positional arguments in declaration order: <aliasKey> then <lineageFile>.
const aliasKeyPath = program.args[0];
const lineageFilePath = program.args[1];

// Flat list of { name, alias } pairs, one per alias key entry whose value is a string.
let aliasTable: any[] = [];
// Lookup from alias name to the list of lineages it stands for.
let aliasDict: { [key: string]: string[] } = {};

// Load the alias key file and index its entries.
const aliasFile = JSON.parse(fs.readFileSync(aliasKeyPath, "utf-8"));

Object.entries(aliasFile).forEach(([name, target]) => {
  if (Array.isArray(target)) {
    // Array values are stored as-is in the lookup and do not get a table row.
    aliasDict[name] = target as string[];
    return;
  }
  if (typeof target !== "string") {
    // Any other value type is ignored.
    return;
  }
  // An empty string means the name has no expansion, so it maps to itself.
  const resolved = target.length > 0 ? target : name;
  aliasTable.push({ name: name, alias: resolved });
  aliasDict[name] = [resolved];
});

/**
 * Expands the leading alias of a dotted lineage name.
 *
 * The segment before the first "." is looked up in `aliasDict`. When it maps
 * to exactly one target, that target replaces the segment; in every other
 * case (unknown alias, or an alias with a different number of targets) the
 * input is returned unchanged.
 *
 * @param lineage Dotted lineage name, possibly starting with an alias.
 * @returns The lineage with its first segment expanded, or the input as-is.
 */
function unaliasLineage(lineage: string): string {
  const segments = lineage.split(".");
  const targets = aliasDict[segments[0]];

  if (!targets || targets.length !== 1) {
    return lineage;
  }
  return [targets[0], ...segments.slice(1)].join(".");
}

/**
 * Returns the direct parent of a dotted lineage name, i.e. everything before
 * the last ".".
 *
 * @param lineage Dotted lineage name.
 * @returns The parent name, or `null` when the name contains no ".".
 */
function findImmediateParent(lineage: string): string | null {
  const lastDot = lineage.lastIndexOf(".");
  return lastDot === -1 ? null : lineage.slice(0, lastDot);
}

/** One row of the lineage table that gets loaded into DuckDB. */
type LineageRow = {
  lineage: string;
  unaliased: string;
  parentLineageUnaliased: string | null;
};

// Every lineage read from the CSV, plus the parents added by fillLineageGaps().
const lineageTable: LineageRow[] = [];
// Unaliased names already present in lineageTable, for O(1) membership checks.
const allUnaliased = new Set<string>();

/** Logs an error to stderr and makes the process exit with a non-zero code. */
function reportFailure(err: unknown): void {
  console.error(err);
  process.exitCode = 1;
}

/**
 * Fills "gaps" in the lineage system: e.g. generates BA if it is missing and
 * BA.something is present.
 */
function fillLineageGaps(): void {
  // lineageTable grows while it is traversed, so each freshly added parent is
  // visited later and its own parent is checked as well.
  for (let idx = 0; idx < lineageTable.length; idx++) {
    const parent = lineageTable[idx].parentLineageUnaliased;
    if (parent && !allUnaliased.has(parent)) {
      allUnaliased.add(parent);
      lineageTable.push({
        lineage: parent,
        unaliased: parent,
        parentLineageUnaliased: findImmediateParent(parent),
      });
    }
  }
}

/**
 * Loads lineageTable into a DuckDB database inside a fresh temporary
 * directory and reads the lineage names back in sorted order.
 *
 * The temporary directory is removed afterwards unless `--preserve-tmp-dir`
 * was given. Any failure is reported through reportFailure().
 */
function loadLineagesIntoDuckDb(): void {
  const tempDir = fs.mkdtempSync(
    path.join(tmpdir(), "silo-lineage-definitions-"),
  );
  if (options.verbose) {
    console.error(`Temporary directory: ${tempDir}`);
  }

  const db = new duckdb.Database(path.join(tempDir, "lineage_transform.duckdb"));

  // Single exit path: report the error (if any), close the database, then
  // clean up the temporary directory.
  const finish = (err?: unknown): void => {
    if (err) {
      reportFailure(err);
    }
    db.close((closeErr) => {
      if (closeErr) {
        reportFailure(closeErr);
      }
      if (!options.preserveTmpDir) {
        fs.rmSync(tempDir, { recursive: true, force: true });
      }
    });
  };

  db.exec(`INSTALL arrow; LOAD arrow;`, (err) => {
    if (err) {
      finish(err);
      return;
    }

    let ipcBuffer: Uint8Array;
    try {
      ipcBuffer = arrow.tableToIPC(arrow.tableFromJSON(lineageTable));
    } catch (conversionErr: unknown) {
      finish(conversionErr);
      return;
    }

    db.register_buffer("lineageTable", [ipcBuffer], true, (err) => {
      if (err) {
        finish(err);
        return;
      }

      // Query the name the buffer was registered under, through the same
      // handle it was registered on.
      db.all(
        "SELECT lineage FROM lineageTable ORDER BY lineage",
        (err, rows: RowData[]) => {
          if (err) {
            finish(err);
            return;
          }

          // NOTE(review): lineageDict is filled but never serialised; the
          // README promises YAML output, so this is presumably where the
          // definitions should be dumped - confirm the intended format.
          const lineageDict: Record<string, Record<string, unknown>> = {};
          for (const row of rows) {
            console.log(row);
            lineageDict[row.lineage] = {};
          }
          finish();
        },
      );
    });
  });
}

// Read and process the lineage CSV file
const lineageStream = fs.createReadStream(lineageFilePath);
// pipe() does not forward source errors (e.g. a missing file), so handle them here.
lineageStream.on("error", reportFailure);
lineageStream
  .pipe(csvParser())
  .on("data", (row: Record<string, string>) => {
    // The lineage name is taken from the first column of each record.
    const lineage = row[Object.keys(row)[0]];
    if (!lineage) {
      // Skip records without a value in the first column.
      return;
    }
    const unaliased = unaliasLineage(lineage);
    allUnaliased.add(unaliased);
    lineageTable.push({
      lineage,
      unaliased,
      parentLineageUnaliased: findImmediateParent(unaliased),
    });
  })
  .on("error", reportFailure)
  .on("end", () => {
    fillLineageGaps();
    loadLineagesIntoDuckDb();
  });
Loading

0 comments on commit 8222885

Please sign in to comment.