Validation: use proper CSV library to generate .csv #242
pkiraly committed Mar 13, 2023
1 parent 18829e6 commit 023d0e5
Showing 5 changed files with 79 additions and 27 deletions.
7 changes: 7 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/CsvUtils.java
@@ -6,7 +6,10 @@
import java.io.Serializable;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class CsvUtils {

@@ -18,6 +21,10 @@ public static String createCsvFromObjects(List<Object> values) {
return createCsv(asArrayFromObject(values));
}

public static String createCsv(Object... values) {
return createCsv(asArrayFromObject(Stream.of(values).collect(Collectors.toList())));
}

public static String createCsv(String[] values) {
String csv = null;

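For orientation, the varargs createCsv(Object...) overload added above is what ValidatorCli switches to in the next file. A minimal usage sketch follows; it is not part of the commit, and the expected strings in the comments are inferred from the updated ValidatorCliTest assertions, which suggest values are quoted only when they contain the separator and that the returned row already ends with a line terminator.

import de.gwdg.metadataqa.marc.CsvUtils;

public class CsvUtilsUsageSketch {
  public static void main(String[] args) {
    // Header row, as printTypeCounts() now writes it:
    String header = CsvUtils.createCsv("id", "categoryId", "category", "type", "instances", "records");
    // expected (per ValidatorCliTest): "id,categoryId,category,type,instances,records\n"

    // Mixed-type data row; note that callers no longer append "\n" themselves:
    String row = CsvUtils.createCsv(8, 3, "data field", "repetition of non-repeatable field", 1, 1);
    // expected (per ValidatorCliTest): "8,3,data field,repetition of non-repeatable field,1,1\n"

    System.out.print(header);
    System.out.print(row);
  }
}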
28 changes: 9 additions & 19 deletions src/main/java/de/gwdg/metadataqa/marc/cli/ValidatorCli.java
@@ -127,10 +127,8 @@ public void beforeIteration() {
logger.info("summary output: " + summaryFile.getPath());

collectorFile = prepareReportFile(parameters.getOutputDir(), "issue-collector.csv");
String header = ValidationErrorFormatter.formatHeaderForCollector(
parameters.getFormat()
);
print(collectorFile, header + "\n");
String header = ValidationErrorFormatter.formatHeaderForCollector(parameters.getFormat());
print(collectorFile, header);

} else {
if (parameters.doSummary())
@@ -139,7 +137,7 @@ public void beforeIteration() {
}
if (parameters.doDetails()) {
String header = ValidationErrorFormatter.formatHeaderForDetails(parameters.getFormat());
print(detailsFile, header + "\n");
print(detailsFile, header);
}

if (parameters.collectAllErrors())
@@ -348,12 +346,6 @@ private void printSummary(char separator) {
cells.add(error.getId());
cells.addAll(Arrays.asList(ValidationErrorFormatter.asArrayWithoutId(error)));
cells.addAll(Arrays.asList(instanceCount, validatorDAO.getRecordBasedErrorCounter().get(error.getId())));
// String formattedOutput = ValidationErrorFormatter.formatForSummary(
// error, parameters.getFormat()
// );
// print(summaryFile, createRow(
// separator, error.getId(), formattedOutput, instanceCount, validatorDAO.getRecordBasedErrorCounter().get(error.getId())
// ));
// TODO: separator
print(summaryFile, CsvUtils.createCsv(cells));
}
@@ -363,7 +355,7 @@
private void printTypeCounts() {
var path = Paths.get(parameters.getOutputDir(), "issue-by-type.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("id", "categoryId", "category", "type", "instances", "records"));
writer.write(CsvUtils.createCsv("id", "categoryId", "category", "type", "instances", "records"));
validatorDAO.getTypeRecordCounter()
.entrySet()
.stream()
@@ -373,9 +365,7 @@ private void printTypeCounts() {
int records = entry.getValue();
int instances = validatorDAO.getTypeInstanceCounter().get(entry.getKey());
try {
writer.write(createRow(
type.getId(), type.getCategory().getId(), type.getCategory().getName(), quote(type.getMessage()), instances, records
));
writer.write(CsvUtils.createCsv(type.getId(), type.getCategory().getId(), type.getCategory().getName(), type.getMessage(), instances, records));
} catch (IOException e) {
logger.log(Level.SEVERE, "printTypeCounts", e);
}
@@ -388,15 +378,15 @@
private void printTotalCounts() {
var path = Paths.get(parameters.getOutputDir(), "issue-total.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("type", "instances", "records"));
writer.write(CsvUtils.createCsv("type", "instances", "records"));
validatorDAO.getTotalRecordCounter()
.entrySet()
.stream()
.forEach(entry -> {
int records = entry.getValue();
int instances = validatorDAO.getTotalInstanceCounter().getOrDefault(entry.getKey(), 0);
try {
writer.write(createRow(entry.getKey(), instances, records));
writer.write(CsvUtils.createCsv(entry.getKey(), instances, records));
} catch (IOException e) {
logger.log(Level.SEVERE, "printTotalCounts", e);
}
@@ -409,7 +399,7 @@
private void printCategoryCounts() {
var path = Paths.get(parameters.getOutputDir(), "issue-by-category.csv");
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow("id", "category", "instances", "records"));
writer.write(CsvUtils.createCsv("id", "category", "instances", "records"));
validatorDAO.getCategoryRecordCounter()
.entrySet()
.stream()
@@ -419,7 +409,7 @@
int records = entry.getValue();
int instances = validatorDAO.getCategoryInstanceCounter().getOrDefault(entry.getKey(), -1);
try {
writer.write(createRow(category.getId(), category.getName(), instances, records));
writer.write(CsvUtils.createCsv(category.getId(), category.getName(), instances, records));
} catch (IOException e) {
logger.log(Level.SEVERE, "printCategoryCounts", e);
}
3 changes: 3 additions & 0 deletions ValidationErrorFormatter.java
@@ -155,6 +155,8 @@ public static String formatForSummary(ValidationError error, ValidationErrorForm
}

private static String createCvsRow(String[] strings, char separator) {
return CsvUtils.createCsv(strings);
/*
StringWriter stringWriter = new StringWriter();
CSVWriter csvWriter = new CSVWriter(stringWriter, separator, '"',
CSVWriter.DEFAULT_ESCAPE_CHARACTER, CSVWriter.DEFAULT_LINE_END);
@@ -164,6 +166,7 @@ private static String createCvsRow(String[] strings, char separator) {
row = row.replace("\\", "\\\\");
}
return row;
*/
}

private static String formatTextWithoutId(ValidationError error) {
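For comparison, the block commented out above was the previous per-row implementation: it created a fresh CSVWriter (opencsv) for each call, which quotes every field by default and then manually doubled backslashes, consistent with the old test expectations further down that wrap each value in quotes. A self-contained reconstruction of that old behaviour follows; the writeNext/toString lines hidden by the collapsed diff, the if-condition, and the com.opencsv import are assumptions.

import com.opencsv.CSVWriter;
import java.io.StringWriter;

public class OldCsvRowSketch {
  // Reconstruction of the pre-commit createCvsRow() for comparison only.
  static String createRow(String[] strings, char separator) {
    StringWriter stringWriter = new StringWriter();
    CSVWriter csvWriter = new CSVWriter(stringWriter, separator, '"',
        CSVWriter.DEFAULT_ESCAPE_CHARACTER, CSVWriter.DEFAULT_LINE_END);
    csvWriter.writeNext(strings);            // quotes all fields by default
    String row = stringWriter.toString();
    if (row.contains("\\")) {
      row = row.replace("\\", "\\\\");       // manual backslash escaping
    }
    return row;
  }

  public static void main(String[] args) {
    System.out.print(createRow(new String[]{"8", "3", "data field", "undefined field"}, ','));
    // -> "8","3","data field","undefined field"
  }
}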
60 changes: 56 additions & 4 deletions src/test/java/de/gwdg/metadataqa/marc/cli/ValidatorCliTest.java
@@ -98,15 +98,16 @@ public void validate_pica_normal() throws Exception {

} else if (outputFile.equals("issue-by-type.csv")) {
assertEquals(5, lines.size());
assertEquals("8,3,data field,\"repetition of non-repeatable field\",1,1", lines.get(1).trim());
assertEquals("9,3,data field,\"undefined field\",21,10", lines.get(2).trim());
assertEquals("8,3,data field,repetition of non-repeatable field,1,1", lines.get(1).trim());
assertEquals("9,3,data field,undefined field,21,10", lines.get(2).trim());

} else if (outputFile.equals("issue-collector.csv")) {
assertEquals(59, lines.size());
assertEquals("1,010000151;010000011;010000054;010000070;010000194;01000002X;010000127;010000038;010000178;010000089", lines.get(1).trim());
assertEquals("2,010000151;010000011;010000054;010000070;010000194;01000002X;010000127;010000038;010000178;010000089", lines.get(2).trim());
assertEquals("3,010000011", lines.get(3).trim());
assertEquals("4,010000011;01000002X;010000038", lines.get(4).trim());

} else if (outputFile.equals("issue-total.csv")) {
assertEquals(3, lines.size());
assertEquals("1,179,10", lines.get(1).trim());
@@ -150,6 +151,7 @@ public void validate_pica_groupBy() throws Exception {
List<String> lines = FileUtils.readLinesFromFile("src/test/resources/output/" + outputFile);
if (outputFile.equals("issue-details.csv")) {
assertEquals(11, lines.size());
assertEquals("recordId,errors", lines.get(0).trim());
assertEquals("010000011,1:1;2:1;3:1;4:1;5:1;6:1;7:1;8:1;9:1;10:1;11:1;12:1;13:1;14:1;15:1;16:1;17:1;18:1;19:1;20:1;21:2;22:2;23:1;24:1", lines.get(1).trim());
assertEquals("01000002X,1:1;2:1;4:1;21:1;5:1;22:1;6:1;23:1;7:1;24:1;8:1;25:1;26:2", lines.get(2).trim());
assertEquals("010000038,1:1;2:1;4:1;5:1;6:1;7:1;8:1;21:2;22:2;23:1;24:1;25:1;26:1;27:1;28:2", lines.get(3).trim());
@@ -171,24 +173,74 @@

} else if (outputFile.equals("issue-by-category.csv")) {
assertEquals(3, lines.size());
assertEquals("id,category,instances,records", lines.get(0).trim());
assertEquals("3,data field,22,10", lines.get(1).trim());

} else if (outputFile.equals("issue-by-type.csv")) {
assertEquals(5, lines.size());
assertEquals("8,3,data field,\"repetition of non-repeatable field\",1,1", lines.get(1).trim());
assertEquals("9,3,data field,\"undefined field\",21,10", lines.get(2).trim());
assertEquals("id,categoryId,category,type,instances,records", lines.get(0).trim());
assertEquals("8,3,data field,repetition of non-repeatable field,1,1", lines.get(1).trim());
assertEquals("9,3,data field,undefined field,21,10", lines.get(2).trim());

} else if (outputFile.equals("issue-collector.csv")) {
assertEquals(59, lines.size());
assertEquals("errorId,recordIds", lines.get(0).trim());
assertEquals("1,010000151;010000011;010000054;010000070;010000194;01000002X;010000127;010000038;010000178;010000089", lines.get(1).trim());
assertEquals("2,010000151;010000011;010000054;010000070;010000194;01000002X;010000127;010000038;010000178;010000089", lines.get(2).trim());
assertEquals("3,010000011", lines.get(3).trim());
assertEquals("4,010000011;01000002X;010000038", lines.get(4).trim());

} else if (outputFile.equals("issue-total.csv")) {
assertEquals(3, lines.size());
assertEquals("type,instances,records", lines.get(0).trim());
assertEquals("1,179,10", lines.get(1).trim());
assertEquals("2,158,9", lines.get(2).trim());

} else if (outputFile.equals("count.csv")) {
assertEquals(2, lines.size());
assertEquals("total", lines.get(0).trim());
assertEquals("10", lines.get(1).trim());

} else if (outputFile.equals("validation.params.json")) {
assertEquals(1, lines.size());
assertTrue(lines.get(0).contains("\"args\":[\"/home/kiru/git/metadata-qa-marc/src/test/resources/pica/pica-with-holdings-info.dat\"]"));
assertTrue(lines.get(0).contains("\"marcVersion\":\"MARC21\","));
assertTrue(lines.get(0).contains("\"marcFormat\":\"PICA_NORMALIZED\","));
assertTrue(lines.get(0).contains("\"dataSource\":\"FILE\","));
assertTrue(lines.get(0).contains("\"limit\":-1,"));
assertTrue(lines.get(0).contains("\"offset\":-1,"));
assertTrue(lines.get(0).contains("\"id\":null,"));
assertTrue(lines.get(0).contains("\"defaultRecordType\":\"BOOKS\","));
assertTrue(lines.get(0).contains("\"alephseq\":false,"));
assertTrue(lines.get(0).contains("\"marcxml\":false,"));
assertTrue(lines.get(0).contains("\"lineSeparated\":false,"));
assertTrue(lines.get(0).contains("\"trimId\":true,"));
assertTrue(lines.get(0).contains("\"outputDir\":\"/home/kiru/git/metadata-qa-marc/src/test/resources/output\","));
assertTrue(lines.get(0).contains("\"recordIgnorator\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"recordFilter\":{\"criteria\":[],\"booleanCriteria\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"ignorableFields\":{\"fields\":null,\"empty\":true},"));
assertTrue(lines.get(0).contains("\"stream\":null,"));
assertTrue(lines.get(0).contains("\"defaultEncoding\":null,"));
assertTrue(lines.get(0).contains("\"alephseqLineType\":null,"));
assertTrue(lines.get(0).contains("\"picaIdField\":\"003@$0\","));
assertTrue(lines.get(0).contains("\"picaSubfieldSeparator\":\"$\","));
assertTrue(lines.get(0).contains("\"picaSchemaFile\":null,"));
assertTrue(lines.get(0).contains("\"picaRecordTypeField\":\"002@$0\","));
assertTrue(lines.get(0).contains("\"schemaType\":\"PICA\","));
assertTrue(lines.get(0).contains("\"groupBy\":\"001@$0\","));
assertTrue(lines.get(0).contains("\"groupListFile\":null,"));
assertTrue(lines.get(0).contains("\"detailsFileName\":\"issue-details.csv\","));
assertTrue(lines.get(0).contains("\"summaryFileName\":\"issue-summary.csv\","));
assertTrue(lines.get(0).contains("\"format\":\"COMMA_SEPARATED\","));
assertTrue(lines.get(0).contains("\"ignorableIssueTypes\":null,"));
assertTrue(lines.get(0).contains("\"pica\":true,"));
assertTrue(lines.get(0).contains("\"replacementInControlFields\":null,"));
assertTrue(lines.get(0).contains("\"marc21\":false,"));
assertTrue(lines.get(0).contains("\"mqaf.version\":\"0.9.0\","));
assertTrue(lines.get(0).contains("\"qa-catalogue.version\":\"0.7.0-SNAPSHOT\"}"));

} else {
assertTrue("Unhandlet output: " + outputFile, outputFile.equals(""));
}

output.delete();
8 changes: 4 additions & 4 deletions src/test/java/de/gwdg/metadataqa/marc/dao/LeaderTest.java
@@ -289,15 +289,15 @@ public void testBadLeaderWithDefaultType() {
assertFalse(errors.isEmpty());
assertEquals(2, errors.size());
assertEquals(
"\"Leader\"\t\"1\"\t\"1\"\t\"undetectable type\"\t\"Leader/06 (typeOfRecord): 'n', Leader/07 (bibliographicLevel): 'm'\"\t\"https://www.loc.gov/marc/bibliographic/bdleader.html\"",
",Leader,1,1,undetectable type,\"Leader/06 (typeOfRecord): 'n', Leader/07 (bibliographicLevel): 'm'\",https://www.loc.gov/marc/bibliographic/bdleader.html\n",
ValidationErrorFormatter.format(
errors.get(0), ValidationErrorFormat.TAB_SEPARATED
errors.get(0), ValidationErrorFormat.COMMA_SEPARATED
)
);
assertEquals(
"\"Leader/06 (leader06)\"\t\"2\"\t\"6\"\t\"invalid value\"\t\"n\"\t\"https://www.loc.gov/marc/bibliographic/bdleader.html\"",
",Leader/06 (leader06),2,6,invalid value,n,https://www.loc.gov/marc/bibliographic/bdleader.html\n",
ValidationErrorFormatter.format(
errors.get(1), ValidationErrorFormat.TAB_SEPARATED
errors.get(1), ValidationErrorFormat.COMMA_SEPARATED
)
);
}