Skip to content

Commit

Permalink
fix hgnc id pollution caused by mondo in ols
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Sep 11, 2024
1 parent ac872aa commit 2fd35ab
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 41 deletions.
85 changes: 46 additions & 39 deletions 01_ingest/grebi_ingest_ols/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,51 +172,58 @@ fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
}
}

//if grebitype.eq("ols:Property") {
// Skip entities with unprefixed IDs to avoid polluting ID space, e.g.
// https://www.ebi.ac.uk/ols4/api/v2/ontologies/mondo/classes/http%253A%252F%252Fidentifiers.org%252Fhgnc%252F4044
// TODO: fix this in OLS? print the nodes with their IRI but no
//
if obj.contains_key("ols:curie") {
if get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().contains(":") {
continue;
}
}
if obj.contains_key("ols:shortForm") {
if get_string_values(obj.get("ols:shortForm").unwrap()).iter().next().unwrap().contains("_") {
continue;
}
}

let qualified_safe_label = {
let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string();
let pref_prefix = {
if curie.contains(":") {
Some(curie.split(":").next().unwrap().to_ascii_lowercase())
let qualified_safe_label = {
let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string();
let pref_prefix = {
if curie.contains(":") {
Some(curie.split(":").next().unwrap().to_ascii_lowercase())
} else {
let definedBy = obj.get("ols:definedBy");
if definedBy.is_some() {
Some(get_string_values(definedBy.unwrap()).iter().next().unwrap().to_string())
} else {
let definedBy = obj.get("ols:definedBy");
if definedBy.is_some() {
Some(get_string_values(definedBy.unwrap()).iter().next().unwrap().to_string())
} else {
None
}
None
}
};
if !pref_prefix.is_some() {
curie.to_string()
} else {
let pref_prefix_u = pref_prefix.unwrap().to_string();
let label = get_string_values(obj.get("ols:label").unwrap()).iter().next().unwrap().to_string();

// this might not be a real label, in which case just return the curie
if label.starts_with(&(pref_prefix_u.to_owned() + ":")) || label.starts_with(&(pref_prefix_u.to_owned() + "_")) {
curie.to_string()
} else {
pref_prefix_u.to_string() + ":" + &label.to_string().as_bytes().iter().map(|x| {
if x.is_ascii_alphanumeric() {
*x as char
} else {
'_'
}
}).collect::<String>()
}
}
}
};
if !pref_prefix.is_some() {
obj.get("ols:iri").unwrap().as_str().unwrap().to_string()
} else {
let pref_prefix_u = pref_prefix.unwrap().to_string();
let label = get_string_values(obj.get("ols:label").unwrap()).iter().next().unwrap().to_string();

output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap();
output_nodes.write_all(Value::String(qualified_safe_label).to_string().as_bytes()).unwrap();
/*} else {
output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap();
let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string();
output_nodes.write_all(Value::String(curie).to_string().as_bytes()).unwrap();
}*/
// this might not be a real label, in which case just return the curie
if label.starts_with(&(pref_prefix_u.to_owned() + ":")) || label.starts_with(&(pref_prefix_u.to_owned() + "_")) {
curie.to_string()
} else {
pref_prefix_u.to_string() + ":" + &label.to_string().as_bytes().iter().map(|x| {
if x.is_ascii_alphanumeric() {
*x as char
} else {
'_'
}
}).collect::<String>()
}
}
};

output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap();
output_nodes.write_all(Value::String(qualified_safe_label).to_string().as_bytes()).unwrap();
output_nodes.write_all(r#","grebi:datasource":""#.as_bytes()).unwrap();
output_nodes.write_all(datasource.as_bytes()).unwrap();
output_nodes.write_all(r#"","grebi:type":[""#.as_bytes()).unwrap();
Expand Down
22 changes: 20 additions & 2 deletions 02_assign_ids/grebi_extract_identifiers/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ fn main() {
} else {
wrote_any = true;
}
writer.write_all(&json.string()).unwrap();
let id = json.string();
check_id(&k, &id);
writer.write_all(&id).unwrap();
} else {
json.value(); // skip
}
Expand All @@ -88,7 +90,9 @@ fn main() {
} else {
wrote_any = true;
}
writer.write_all(&json.string()).unwrap();
let id = json.string();
check_id(&k, &id);
writer.write_all(&id).unwrap();
} else {
json.value(); // skip
}
Expand All @@ -110,5 +114,19 @@ fn main() {

}

/// Aborts when an identifier value is made up solely of ASCII digits.
///
/// `k` holds the raw bytes of the identifier property name and `id` the raw bytes of its value.
/// Both are decoded (lossily) only to build the panic message.
///
/// # Panics
///
/// Panics if every byte of `id` is an ASCII digit. An empty `id` contains no non-digit byte,
/// so it panics as well.
fn check_id(k: &[u8], id: &[u8]) {
    // A single non-digit byte anywhere in the value is enough to accept it.
    if id.iter().all(|byte| byte.is_ascii_digit()) {
        panic!("Found unprefixed numeric ID {} for identifier property {}. Unqualified numbers like this as identifiers are ambiguous and may cause incorrect equivalences.", String::from_utf8_lossy(id), String::from_utf8_lossy(k));
    }
}




4 changes: 4 additions & 0 deletions configs/datasource_configs/hgnc.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
{ "name": "--json-inject-key-prefix", "value": "hgnc:" },
{ "name": "--json-inject-value-prefix", "value": "uniprot_ids:uniprot:" },
{ "name": "--json-inject-value-prefix", "value": "omim_id:omim:" },
{ "name": "--json-inject-value-prefix", "value": "ena:ena:" },
{ "name": "--json-inject-value-prefix", "value": "vega_id:vega:" },
{ "name": "--json-inject-value-prefix", "value": "ccds_id:ccds:" },
{ "name": "--json-inject-value-prefix", "value": "entrez_id:entrez:" },
{ "name": "--json-inject-value-prefix", "value": "pubmed_id:pmid:" }
]
}
Expand Down

0 comments on commit 2fd35ab

Please sign in to comment.