Skip to content

Commit

Permalink
improve parsing of names and sources; update XML output from viaf in …
Browse files Browse the repository at this point in the history
…test suite; bump version
  • Loading branch information
codeforkjeff committed Feb 6, 2016
1 parent 0a875dc commit 2144bcf
Show file tree
Hide file tree
Showing 9 changed files with 43,763 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ build the .jar file using maven.
Run this command:

```
java -jar refine_viaf-1.0.jar
java -jar refine_viaf-1.1.jar
```

That's it! You should see some messages as the application starts
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>org.refine_viaf</groupId>
<artifactId>refine_viaf</artifactId>
<version>1.0</version>
<version>1.1</version>
<packaging>jar</packaging>

<name>refine_viaf</name>
Expand All @@ -17,7 +17,7 @@
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.3.0.RELEASE</version>
<version>1.3.2.RELEASE</version>
</parent>

<dependencies>
Expand Down
2 changes: 1 addition & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
# the JVM from having to dynamically allocate memory, which takes time.
# -Xms128m -Xmx128m

java -Xms128m -Xmx128m -Dlogging.level.com.codefork.refine=DEBUG -jar target/refine_viaf-1.0.jar
java -Xms128m -Xmx128m -Dlogging.level.com.codefork.refine=DEBUG -jar target/refine_viaf-1.1.jar
8 changes: 7 additions & 1 deletion src/main/java/com/codefork/refine/viaf/VIAFParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ public class VIAFParser extends DefaultHandler {
private boolean captureChars = false;
private boolean insideHeadings = false;
private boolean insideSources = false;
private int depth = 0;
private int headingsDepth = -1;

// viaf's weird indexed namespacing
private int nsIndex = 2;
Expand Down Expand Up @@ -52,6 +54,7 @@ public void startElement(String uri, String localName, String qName, Attributes
captureChars = true;
} else if (getElementNameWithNS("mainHeadings").equals(qName)) {
insideHeadings = true;
headingsDepth = depth;
} else if (insideHeadings && getElementNameWithNS("data").equals(qName)) {
getLastResult().addNameEntry();
} else if (insideHeadings && getElementNameWithNS("text").equals(qName)) {
Expand All @@ -60,13 +63,14 @@ public void startElement(String uri, String localName, String qName, Attributes
insideSources = true;
} else if (insideSources && getElementNameWithNS("s").equals(qName)) {
captureChars = true;
} else if (insideHeadings) {
} else if (headingsDepth != -1 && depth == headingsDepth + 1) {
// if we got here, we encountered some other child of mainHeadings
// so we want to effectively end the section, otherwise we'll end up
// erroneously picking up other "text" and "sources" elements nested
// under other elements in mainHeadings
insideHeadings = false;
}
depth ++;
}

@Override
Expand All @@ -83,6 +87,7 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
captureChars = false;
} else if (getElementNameWithNS("mainHeadings").equals(qName)) {
insideHeadings = false;
headingsDepth = -1;
} else if (insideHeadings && getElementNameWithNS("text").equals(qName)) {
getLastResult().getLastNameEntry().setName(buf.toString());
buf = new StringBuilder();
Expand All @@ -94,6 +99,7 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
buf = new StringBuilder();
captureChars = false;
}
depth --;
}

@Override
Expand Down
77 changes: 77 additions & 0 deletions src/test/java/com/codefork/refine/viaf/VIAFParserTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package com.codefork.refine.viaf;

import org.junit.Test;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.InputStream;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link VIAFParser}: parses a saved VIAF response from the test
 * resources and checks the names and sources collected for each result.
 */
public class VIAFParserTest {

    /**
     * Joins the given strings into a single string, inserting the delimiter
     * between every pair of consecutive elements.
     *
     * <p>The delimiter decision is based on element position rather than on
     * the length of the text built so far, so empty elements (including
     * leading ones) still get their separators.
     *
     * @param strings the elements to join, in iteration order
     * @param delimiter the text placed between consecutive elements
     * @return the joined string; empty when {@code strings} has no elements
     */
    private static String joinStrings(List<String> strings, String delimiter) {
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (String s : strings) {
            if (!first) {
                joined.append(delimiter);
            }
            joined.append(s);
            first = false;
        }
        return joined.toString();
    }

    /**
     * Parses {@code /steinbeck_no_type.xml} from the classpath with a SAX
     * parser driving a {@link VIAFParser}, then verifies the name entries and
     * their sources for the first two results.
     *
     * @throws Exception if the parser cannot be created or parsing fails
     */
    @Test
    public void testParseNames() throws Exception {
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser parser = spf.newSAXParser();
        VIAFParser viafParser = new VIAFParser();

        InputStream is = getClass().getResourceAsStream("/steinbeck_no_type.xml");
        try {
            parser.parse(is, viafParser);
        } finally {
            // getResourceAsStream returns null for a missing resource; guard
            // so the close does not mask the parse failure with an NPE.
            if (is != null) {
                is.close();
            }
        }

        List<VIAFResult> results = viafParser.getResults();

        VIAFResult firstResult = results.get(0);
        VIAFResult secondResult = results.get(1);

        // first result: ten name entries, each checked for its sources
        assertEquals(10, firstResult.getNameEntries().size());

        assertEquals("Steinbeck, John, 1902-1968",
                firstResult.getNameEntries().get(0).getName());
        assertEquals("LC,BIBSYS,BNF,KRNLK,N6I,LAC,BNE,SUDOC,BAV,BNC,NLI,B2Q,PTBNP,NLP,LNB,SELIBR,NLA,ICCU,NDL,DNB,NUKAT,NKC",
                joinStrings(firstResult.getNameEntries().get(0).getSources(), ","));

        assertEquals("Steinbeck, John (John Ernst), 1902-1968",
                firstResult.getNameEntries().get(1).getName());
        assertEquals("NTA",
                joinStrings(firstResult.getNameEntries().get(1).getSources(), ","));

        assertEquals("NSK,SWNL",
                joinStrings(firstResult.getNameEntries().get(2).getSources(), ","));
        assertEquals("WKP",
                joinStrings(firstResult.getNameEntries().get(3).getSources(), ","));
        assertEquals("LNL,EGAXA",
                joinStrings(firstResult.getNameEntries().get(4).getSources(), ","));
        assertEquals("NLI",
                joinStrings(firstResult.getNameEntries().get(5).getSources(), ","));
        assertEquals("NLI",
                joinStrings(firstResult.getNameEntries().get(6).getSources(), ","));
        assertEquals("NLI",
                joinStrings(firstResult.getNameEntries().get(7).getSources(), ","));
        assertEquals("NLR",
                joinStrings(firstResult.getNameEntries().get(8).getSources(), ","));
        assertEquals("JPG",
                joinStrings(firstResult.getNameEntries().get(9).getSources(), ","));

        // second result: five name entries; only the first is checked in detail
        assertEquals(5, secondResult.getNameEntries().size());

        assertEquals("Steinbeck, John 1946-1991",
                secondResult.getNameEntries().get(0).getName());
        assertEquals("NLP,ICCU,DNB,BNF",
                joinStrings(secondResult.getNameEntries().get(0).getSources(), ","));

    }

}
18 changes: 9 additions & 9 deletions src/test/java/com/codefork/refine/viaf/VIAFTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ public void testSearchNoParticularType() throws Exception {
assertFalse(result1.isMatch());

Result result2 = results.get(1);
assertEquals("Steinbeck, John, 1902-1968. | Of mice and men.", result2.getName());
assertEquals(NameType.Book.asVIAFNameType(), result2.getType().get(0));
assertEquals("180993990", result2.getId());
assertEquals("Steinbeck, John 1946-1991", result2.getName());
assertEquals(NameType.Person.asVIAFNameType(), result2.getType().get(0));
assertEquals("19893647", result2.getId());
assertFalse(result2.isMatch());

Result result3 = results.get(2);
assertEquals("Steinbeck, John 1946-1991", result3.getName());
assertEquals(NameType.Person.asVIAFNameType(), result3.getType().get(0));
assertEquals("19893647", result3.getId());
assertEquals("Steinbeck, John, 1902-1968. | Of mice and men.", result3.getName());
assertEquals(NameType.Book.asVIAFNameType(), result3.getType().get(0));
assertEquals("180993990", result3.getId());
assertFalse(result3.isMatch());
}

Expand All @@ -105,15 +105,15 @@ public void testSearchWithSource() throws Exception {
assertFalse(result1.isMatch());

Result result2 = results.get(1);
assertEquals("Nabokov, Vladimir Vladimirovič | Volšebnik", result2.getName());
assertEquals("Nabokov, Vladimir Vladimirovič | Lolita", result2.getName());
assertEquals(NameType.Book.asVIAFNameType(), result2.getType().get(0));
assertEquals("316638111", result2.getId());
assertEquals("176671347", result2.getId());
assertFalse(result2.isMatch());

Result result3 = results.get(2);
assertEquals("Nabokov, Vladimir Vladimirovič | Govori, sjećanje!", result3.getName());
assertEquals(NameType.Book.asVIAFNameType(), result3.getType().get(0));
assertEquals("140144814502844904157", result3.getId());
assertEquals("183561595", result3.getId());
assertFalse(result3.isMatch());
}

Expand Down
8,676 changes: 8,674 additions & 2 deletions src/test/resources/nabokov_nsk.xml

Large diffs are not rendered by default.

25,467 changes: 25,465 additions & 2 deletions src/test/resources/shakespeare.xml

Large diffs are not rendered by default.

9,529 changes: 9,527 additions & 2 deletions src/test/resources/steinbeck_no_type.xml

Large diffs are not rendered by default.

0 comments on commit 2144bcf

Please sign in to comment.