Skip to content

Commit

Permalink
amidict submits SPARQL and creates dictionaries
Browse files Browse the repository at this point in the history
  • Loading branch information
petermr committed Aug 6, 2020
1 parent 6257f99 commit ed1f2ec
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 42 deletions.
8 changes: 5 additions & 3 deletions RELEASE-NOTES-NEXT.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## Summary

This is a template and should be replaced by actual release notes...
Enhancement to Dictionaries

## Changes in this Release
- First Change
- Second Change
- amidict can submit SPARQL queries to Wikidata
- amidict translates wikidataAltLabel to synonyms
- DSL for transforming dictionaries (EXTRACT, DELETE, etc.)

Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,18 @@ public class DictionaryCreationTool extends AbstractAMIDictTool {
split=",",
description = "list of terms (entries), space-separated. Requires `inputname` or `dictionary`")
private List<String> terms;

private Set<String> termSet;


// Maps each new attribute name (key) to its raw transform specification (value).
// The value is handed to DictionaryTransformer, which expects OPERATION(oldAttName,regex).
// Because the option is a Map, each item is written key=value; '@' (the split regex)
// separates several items given in one argument.
@Option(names= {"--transformName"},
    split="@",
    description="create new attribute name (key) and populate it with a transformed value. Syntax: "
        + "newAttName=operation(oldAttName,operationValue) where 'operation' is EXTRACT and operationValue "
        + "is a regex whose first capture group becomes the new value. Separate multiple transforms with '@'. "
        + "More operations may be added later (e.g. delete and append)"
    )
private Map<String, String> transformationByAmiName = new HashMap<>();


@Option(names = {"--wptype"},
arity="1",
description = "type of input (HTML , mediawiki)")
Expand Down Expand Up @@ -294,16 +303,26 @@ public void runSub() {
for (String templateName : templateNames) {
currentTemplateName = templateName;
input(createDictionaryName(currentTemplateName));
createDictionary();
createAndWriteDictionary();
}

} else {
// single input
createDictionary();
createAndWriteDictionary();
}
printMissingLinks();

}

/**
 * Applies every transform held in {@code transformationByAmiName} to
 * {@code simpleDictionary}. Each map key is the name of the attribute to create and
 * each value is the raw transform specification parsed by DictionaryTransformer.
 */
private void transformValues() {
    // an empty map simply yields no iterations, so no explicit size check is needed
    for (Map.Entry<String, String> transform : transformationByAmiName.entrySet()) {
        String newAttributeName = transform.getKey();
        String rawTransform = transform.getValue();
        new DictionaryTransformer(newAttributeName, rawTransform).transform(simpleDictionary);
    }
}


private void createFilenamesForWikimediaInput() {
Expand Down Expand Up @@ -352,7 +371,7 @@ private String createDictionaryName(String templateName) {
return templateName.toLowerCase().replaceAll("[^A-Za-z0-9_\\-]", "");
}

private void createDictionary() {
private void createAndWriteDictionary() {
InputStream inputStream = null;
if (input() != null) {
inputStream = getInputStreamFromFile();
Expand Down Expand Up @@ -940,6 +959,7 @@ private static String trimWikipediaUrlBase(String urlValue) {

private void writeDictionary(String dictionary) {
// this is slightly messy -
transformValues();
simpleDictionary.getDictionaryElement().addAttribute(new Attribute(DefaultAMIDictionary.TITLE, dictionary));
File subDirectory = getOrCreateExistingSubdirectory(dictionary);
if (subDirectory != null) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package org.contentmine.ami.tools.dictionary;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nu.xom.Attribute;
import nu.xom.Element;

/**
 * Adds a derived attribute to dictionary entries by applying a regex to an existing attribute.
 *
 * <p>A transform is written {@code OPERATION(sourceAttribute,regex)}, for example
 * {@code EXTRACT(wikidataURL,.+/(.+))}. The operation must be {@code EXTRACT} or
 * {@code JUNK}; it is recorded and exposed through {@link #getOperation()}, but
 * {@link #transform(SimpleDictionary)} currently performs the same extraction for both.
 * The regex must contain at least one capturing group, because the text captured by
 * group 1 becomes the value of the new attribute.
 *
 * @author pm286
 *
 */
public class DictionaryTransformer {

    /** Grammar of a raw transform: group 1 = operation, group 2 = source attribute, group 3 = regex. */
    private static final Pattern TRANSFORM_PATTERN = Pattern.compile("(EXTRACT|JUNK)\\(([^,]+),(.*)\\)");

    private String operation;
    private String variableName;
    private Pattern pattern;
    private String newVariableName;

    /**
     * Creates a transformer that writes its result into {@code newVariableName}.
     *
     * @param newVariableName name of the attribute to create on each matching entry
     * @param rawTransform transform specification of the form {@code OPERATION(sourceAttribute,regex)}
     * @throws RuntimeException if {@code rawTransform} does not follow that form, or if its
     *         regex has no capturing group
     */
    public DictionaryTransformer(String newVariableName, String rawTransform) {
        this.newVariableName = newVariableName;
        readAndParse(rawTransform);
    }

    /** Splits the raw transform into operation, source attribute name and compiled regex. */
    private void readAndParse(String rawTransform) {
        Matcher matcher = TRANSFORM_PATTERN.matcher(rawTransform);
        if (!matcher.matches()) {
            throw new RuntimeException("bad rawTransform "+rawTransform);
        }
        operation = matcher.group(1);
        variableName = matcher.group(2);
        String regexString = matcher.group(3);
        pattern = Pattern.compile(regexString);
        // transform() reads group(1); reject a group-less regex here with a clear message
        // instead of failing later with an IndexOutOfBoundsException on the first match
        if (pattern.matcher("").groupCount() < 1) {
            throw new IllegalArgumentException(
                    "bad rawTransform " + rawTransform + " (regex needs at least one capturing group)");
        }
    }

    /** @return the operation keyword parsed from the raw transform */
    public String getOperation() {
        return operation;
    }

    /** @return the name of the attribute whose value is read */
    public String getVariableName() {
        return variableName;
    }

    /** @return the compiled regex applied to the source attribute value */
    public Pattern getPattern() {
        return pattern;
    }

    /**
     * Adds the new attribute to every entry whose source attribute value matches the
     * regex in full. Entries without the source attribute, or whose value does not
     * match, are left unchanged.
     *
     * @param simpleDictionary dictionary whose entries are modified in place
     */
    public void transform(SimpleDictionary simpleDictionary) {
        List<Element> entryList = simpleDictionary.getEntryList();
        for (Element entry : entryList) {
            String value = entry.getAttributeValue(variableName);
            if (value == null) {
                // entry has no source attribute; Pattern.matcher(null) would throw NPE
                continue;
            }
            Matcher matcher = pattern.matcher(value);
            if (matcher.matches()) {
                entry.addAttribute(new Attribute(newVariableName, matcher.group(1)));
            }
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ private String getValueByBindingName(Element sparqlResult, String bindingName) {
String childName = child.getLocalName();
if (childName.contentEquals(DictionaryCreationTool.URI)) {
// last field in URI
value = value.substring(value.lastIndexOf("/") + 1);
// value = value.substring(value.lastIndexOf("/") + 1);
value = value;
} else if (childName.contentEquals(DictionaryCreationTool.LITERAL)) {
// copy direct
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,33 @@ public void testCreateFromWikidataQueryMap() throws IOException {
AbstractAMITest.writeOutputAndCompare(TEST_DICTIONARY, dictionary, outputDir);
}

/**
 * Creates the "disease4" dictionary from a Wikidata SPARQL query, mapping the query
 * bindings to dictionary attributes, deriving a wikidataID attribute from wikidataURL
 * with an EXTRACT transform, and then compares the output with the reference dictionary.
 */
@Test
public void testCreateFromWikidataQueryMapTransform() throws IOException {
    final String dictionary = "disease4";
    final File outputDir = TARGET_DICTIONARY;
    final File queryFile = new File(TEST_DICTIONARY, dictionary + ".sparql");

    StringBuilder args = new StringBuilder("-vvv");
    args.append(" --dictionary ").append(dictionary);
    args.append(" --directory=").append(outputDir);
    args.append(" create");
    args.append(" --informat=wikisparqlxml");
    args.append(" --sparqlquery ").append(queryFile);
    // dictionary attribute name = SPARQL binding name
    args.append(" --sparqlmap ")
        .append("wikidataURL=wikidata,")
        .append("wikipediaURL=wikipedia,")
        .append("description=wikidataDescription,")
        .append("wikidataAltLabel=wikidataAltLabel,")
        .append("term=wikidataLabel,")
        .append("name=wikidataLabel");
    // new attribute wikidataID takes the text after the last '/' of wikidataURL
    args.append(" --transformName wikidataID=EXTRACT(wikidataURL,.*/(.*))");
    args.append(" --synonyms=wikidataAltLabel");

    AMIDict.execute(args.toString());
    AbstractAMITest.writeOutputAndCompare(TEST_DICTIONARY, dictionary, outputDir);
}


// CREATE
@Test
/** creates mini dictionary with wikipedia and wikidata links where possible
Expand Down
Original file line number Diff line number Diff line change
@@ -1,35 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<dictionary title="disease4">
<entry description="injury caused by a bite from a snake" name="snakebite" term="snakebite" wikidata="Q68854" wikidataAltLabel="snake bite, snake bites, snake envenomation, snake envenoming" wikipedia="Snakebite">
<synonym>snake bite</synonym>
<synonym>snake bites</synonym>
<synonym>snake envenomation</synonym>
<synonym>snake envenoming</synonym>
</entry>
<entry description="Human disease: infectious thrombophlebitis of the internal jugular vein" name="Lemierre's syndrome" term="Lemierre's syndrome" wikidata="Q72000" wikidataAltLabel="Lemierre syndrome, acute sore throat, human necrobacillosis, postanginal sepsis" wikipedia="Lemierre%27s_syndrome">
<synonym>Lemierre syndrome</synonym>
<synonym>acute sore throat</synonym>
<synonym>human necrobacillosis</synonym>
<synonym>postanginal sepsis</synonym>
</entry>
<entry description="endocarditis that results from the deposition of small sterile vegetations on valve leaflets" name="marantic endocarditis" term="marantic endocarditis" wikidata="Q73518" wikidataAltLabel="non-bacterial thrombotic endocarditis, non-infective endocarditis, nonbacterial thrombotic endocarditis" wikipedia="Nonbacterial_thrombotic_endocarditis">
<synonym>non-bacterial thrombotic endocarditis</synonym>
<synonym>non-infective endocarditis</synonym>
<synonym>nonbacterial thrombotic endocarditis</synonym>
</entry>
<entry description="type of inflammatory bowel disease" name="Crohn's disease" term="Crohn's disease" wikidata="Q1472" wikidataAltLabel="Crohn disease, Crohn, Crohn's disease of colon, Crohn's disease of colon (disorder), Crohn's disease of large bowel, Granulomatous Colitis, Pediatric Crohn's disease, regional colitis, regional enteritis, regional enteritis of small intestine with large intestine, regional enteritis of the large bowel, regional Ileitis, regional ileocolitis" wikipedia="Crohn%27s_disease">
<synonym>Crohn disease</synonym>
<synonym>Crohn</synonym>
<synonym>Crohn's disease of colon</synonym>
<synonym>Crohn's disease of colon (disorder)</synonym>
<synonym>Crohn's disease of large bowel</synonym>
<synonym>Granulomatous Colitis</synonym>
<synonym>Pediatric Crohn's disease</synonym>
<synonym>regional colitis</synonym>
<synonym>regional enteritis</synonym>
<synonym>regional enteritis of small intestine with large intestine</synonym>
<synonym>regional enteritis of the large bowel</synonym>
<synonym>regional Ileitis</synonym>
<synonym>regional ileocolitis</synonym>
</entry>
</dictionary>
<entry description="injury caused by a bite from a snake" name="snakebite" term="snakebite" wikidataAltLabel="snake bite, snake bites, snake envenomation, snake envenoming" wikidataURL="http://www.wikidata.org/entity/Q68854" wikipediaURL="https://en.wikipedia.org/wiki/Snakebite" wikidataID="Q68854">
<synonym>snake bite</synonym>
<synonym>snake bites</synonym>
<synonym>snake envenomation</synonym>
<synonym>snake envenoming</synonym>
</entry>
<entry description="Human disease: infectious thrombophlebitis of the internal jugular vein" name="Lemierre's syndrome" term="Lemierre's syndrome" wikidataAltLabel="Lemierre syndrome, acute sore throat, human necrobacillosis, postanginal sepsis" wikidataURL="http://www.wikidata.org/entity/Q72000" wikipediaURL="https://en.wikipedia.org/wiki/Lemierre%27s_syndrome" wikidataID="Q72000">
<synonym>Lemierre syndrome</synonym>
<synonym>acute sore throat</synonym>
<synonym>human necrobacillosis</synonym>
<synonym>postanginal sepsis</synonym>
</entry>
<entry description="endocarditis that results from the deposition of small sterile vegetations on valve leaflets" name="marantic endocarditis" term="marantic endocarditis" wikidataAltLabel="non-bacterial thrombotic endocarditis, non-infective endocarditis, nonbacterial thrombotic endocarditis" wikidataURL="http://www.wikidata.org/entity/Q73518" wikipediaURL="https://en.wikipedia.org/wiki/Nonbacterial_thrombotic_endocarditis" wikidataID="Q73518">
<synonym>non-bacterial thrombotic endocarditis</synonym>
<synonym>non-infective endocarditis</synonym>
<synonym>nonbacterial thrombotic endocarditis</synonym>
</entry>
<entry description="type of inflammatory bowel disease" name="Crohn's disease" term="Crohn's disease" wikidataAltLabel="Crohn disease, Crohn, Crohn's disease of colon, Crohn's disease of colon (disorder), Crohn's disease of large bowel, Granulomatous Colitis, Pediatric Crohn's disease, regional colitis, regional enteritis, regional enteritis of small intestine with large intestine, regional enteritis of the large bowel, regional Ileitis, regional ileocolitis" wikidataURL="http://www.wikidata.org/entity/Q1472" wikipediaURL="https://en.wikipedia.org/wiki/Crohn%27s_disease" wikidataID="Q1472">
<synonym>Crohn disease</synonym>
<synonym>Crohn</synonym>
<synonym>Crohn's disease of colon</synonym>
<synonym>Crohn's disease of colon (disorder)</synonym>
<synonym>Crohn's disease of large bowel</synonym>
<synonym>Granulomatous Colitis</synonym>
<synonym>Pediatric Crohn's disease</synonym>
<synonym>regional colitis</synonym>
<synonym>regional enteritis</synonym>
<synonym>regional enteritis of small intestine with large intestine</synonym>
<synonym>regional enteritis of the large bowel</synonym>
<synonym>regional Ileitis</synonym>
<synonym>regional ileocolitis</synonym>
</entry>
</dictionary>

0 comments on commit ed1f2ec

Please sign in to comment.