Skip to content

Commit

Permalink
Rewrite validators to enable omitting of invalid data in output
Browse files Browse the repository at this point in the history
  • Loading branch information
zstojanovic committed Oct 24, 2017
1 parent 4d213cd commit 7d2abc7
Show file tree
Hide file tree
Showing 14 changed files with 449 additions and 284 deletions.
60 changes: 21 additions & 39 deletions src/main/java/org/wikivoyage/listings/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
Expand Down Expand Up @@ -38,8 +34,7 @@
import org.wikivoyage.listings.validators.LongitudeValidator;
import org.wikivoyage.listings.validators.Validator;
import org.wikivoyage.listings.validators.WebsiteURLValidator;
import org.wikivoyage.listings.validators.BulkValidator;
import org.wikivoyage.listings.validators.WikidataBulkValidator;
import org.wikivoyage.listings.validators.WikidataValidator;

public class Main {
private static final Log log = LogFactory.getLog(Main.class);
Expand Down Expand Up @@ -185,39 +180,10 @@ private static void processDump(
new JavaSerializedObject().write(listings, javaSerialFile, dumpDate);
listings = new JavaSerializedIterable(javaSerialFile);
}

// Create a list containing only valid listings.
Validator [] validators = {
new LatitudeValidator(),
new LongitudeValidator(),
new WebsiteURLValidator(),
new EmailValidator()
};
BulkValidator bulkValidator = new WikidataBulkValidator();
List<Listing> validListings = new ArrayList<Listing>();
for (Listing listing : listings) {
boolean valid = true;
for (Validator validator : validators) {
valid &= (validator.validate(listing) == null); // validator returns null if valid.
}
if (valid) {
validListings.add(listing);
}
bulkValidator.add(listing);
}
// Remove all POIs which are not valid according to WikidataBulkValidator
for (Listing invalidPoi : bulkValidator.validate().keySet()) {
validListings.remove(invalidPoi);
}

// Write all listings (including invalid ones) to validation output.
writeFormat(listings, language, dumpDate, latestDumpDate, new ValidationReport());

// Write valid listings to the other output formats.
HashMap<String, OutputFormat> formatsForValidListings = new HashMap<>(formats);
formatsForValidListings.remove("validation-report");
for (OutputFormat format: formatsForValidListings.values()) {
writeFormat(validListings, language, dumpDate, latestDumpDate, format);
listings = validate(listings);
// Write listings to all the output formats.
for (OutputFormat format: formats.values()) {
writeFormat(listings, language, dumpDate, latestDumpDate, format);
}
}

Expand Down Expand Up @@ -264,7 +230,23 @@ private static void generateFileForFormat(
) throws WriteOutputException, DumpReadException {
log.info("Parse dump");
Iterable<Listing> listingIterable = new ListingsIterable(inputFilename);
listingIterable = validate(listingIterable);
log.info("Save to '" + outputFilename + "'");
format.write(listingIterable, outputFilename, dumpDate);
}

/**
 * Passes the listings through every validator, one after another.
 *
 * Each validator receives the iterable returned by the previous one; the order is
 * latitude, longitude, website URL, e-mail, Wikidata.
 *
 * @param listingIterable listings to validate
 * @return the iterable returned by the last validator in the chain
 */
private static Iterable<Listing> validate(Iterable<Listing> listingIterable) {
    final Validator[] chain = {
        new LatitudeValidator(),
        new LongitudeValidator(),
        new WebsiteURLValidator(),
        new EmailValidator(),
        new WikidataValidator()
    };
    Iterable<Listing> current = listingIterable;
    for (int i = 0; i < chain.length; i++) {
        current = chain[i].validate(current);
    }
    return current;
}
}
111 changes: 111 additions & 0 deletions src/main/java/org/wikivoyage/listings/entity/Listing.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package org.wikivoyage.listings.entity;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.wikivoyage.listings.validators.ValidationIssue;

/**
* Entity representing a Wikivoyage listing.
Expand Down Expand Up @@ -140,6 +144,11 @@ public class Listing implements Serializable {
* 2-character language code
*/
protected String language;

/**
* List of ValidationIssues found with this POI during validation
*/
protected List<ValidationIssue> validationIssues = new ArrayList<>();

public Listing(
String article, String type, String title, String alt, String wikidata, String wikipedia, String address, String directions,
Expand Down Expand Up @@ -191,7 +200,25 @@ public String getAlt() {
return alt;
}

/**
 * Get wikidata.
 *
 * @return wikidata, or null when this listing carries an INVALID_WIKIDATA_QID
 *         or REDIRECT_WIKIDATA_QID validation issue
 */
public String getWikidata() {
    final boolean flagged = validationIssues.contains(ValidationIssue.INVALID_WIKIDATA_QID)
        || validationIssues.contains(ValidationIssue.REDIRECT_WIKIDATA_QID);
    return flagged ? null : wikidata;
}

/**
 * Get wikidata exactly as stored, regardless of any validation issues.
 * Intended for the validation report.
 *
 * @return unvalidated wikidata
 */
public String rawWikidata() {
    return this.wikidata;
}

Expand All @@ -215,15 +242,51 @@ public String getTollFree() {
return tollFree;
}

/**
 * Get email.
 *
 * @return email, or null when this listing carries an INVALID_EMAIL validation issue
 */
public String getEmail() {
    return validationIssues.contains(ValidationIssue.INVALID_EMAIL) ? null : email;
}

/**
 * Get email exactly as stored, regardless of any validation issues.
 * Intended for the validation report.
 *
 * @return unvalidated email
 */
public String rawEmail() {
    return this.email;
}

/**
 * Get fax.
 *
 * @return fax as stored on this listing
 */
public String getFax() {
    return this.fax;
}

/**
 * Get URL.
 *
 * @return URL, or null when this listing carries an INVALID_URL validation issue
 */
public String getUrl() {
    // Guard: a flagged URL is withheld from callers.
    if (validationIssues.contains(ValidationIssue.INVALID_URL)) {
        return null;
    }
    return url;
}

/**
 * Get URL exactly as stored, regardless of any validation issues.
 * Intended for the validation report.
 *
 * @return unvalidated URL
 */
public String rawUrl() {
    return this.url;
}

Expand All @@ -247,11 +310,47 @@ public String getPrice() {
return price;
}

/**
 * Get latitude.
 *
 * @return latitude, or null when this listing carries an INVALID_LATITUDE validation issue
 */
public String getLatitude() {
    final boolean flagged = validationIssues.contains(ValidationIssue.INVALID_LATITUDE);
    return flagged ? null : latitude;
}

/**
 * Get latitude exactly as stored, regardless of any validation issues.
 * Intended for the validation report.
 *
 * @return unvalidated latitude
 */
public String rawLatitude() {
    return this.latitude;
}

/**
 * Get longitude.
 *
 * @return longitude, or null when this listing carries an INVALID_LONGITUDE validation issue
 */
public String getLongitude() {
    // Guard: a flagged longitude is withheld from callers.
    if (validationIssues.contains(ValidationIssue.INVALID_LONGITUDE)) {
        return null;
    }
    return longitude;
}

/**
 * Get longitude exactly as stored, regardless of any validation issues.
 * Intended for the validation report.
 *
 * @return unvalidated longitude
 */
public String rawLongitude() {
    return this.longitude;
}

Expand Down Expand Up @@ -280,4 +379,16 @@ public boolean isPositionalDataEmpty()
return latitude == null || longitude == null ||
latitude.equals("") || longitude.equals("");
}

/**
 * Record a validation issue against this listing.
 *
 * @param issue issue to append to this listing's issue list
 */
public void add(ValidationIssue issue) {
    this.validationIssues.add(issue);
}

/**
 * Get the validation issues recorded for this listing.
 *
 * @return the listing's own issue list (the internal list itself, not a copy)
 */
public List<ValidationIssue> getValidationIssues() {
    return this.validationIssues;
}

/**
 * Tell whether this listing is free of validation issues.
 *
 * @return true when no validation issue has been recorded, false otherwise
 */
public boolean isValid() {
    final boolean noIssues = this.validationIssues.isEmpty();
    return noIssues;
}
}
22 changes: 2 additions & 20 deletions src/main/java/org/wikivoyage/listings/output/ValidationReport.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,19 @@
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;

/**
* Generate an HTML report showing what syntax errors exist in the Wikivoyage data.
*/
public class ValidationReport implements OutputFormat {
@Override
public void write(Iterable<Listing> pois, String outputFilename, String dumpDate) throws WriteOutputException {
Validator [] validators = {
new LatitudeValidator(),
new LongitudeValidator(),
new WebsiteURLValidator(),
new EmailValidator()
};
BulkValidator bulkValidator = new WikidataBulkValidator();

try {
StringBuilder rows = new StringBuilder();
for (Listing poi: pois) {
for (Validator validator: validators) {
String errorMessage = validator.validate(poi);
if (errorMessage != null) {
rows.append(createRow(poi, errorMessage, validator.getIssueType()));
}
for (ValidationIssue issue : poi.getValidationIssues()) {
rows.append(createRow(poi, issue.getDescription(poi), issue.getCategory()));
}
bulkValidator.add(poi);
}
Map<Listing, String> bulkValidationResults = bulkValidator.validate();
for (Entry<Listing, String> entry : bulkValidationResults.entrySet()) {
rows.append(createRow(entry.getKey(), entry.getValue(), bulkValidator.getIssueType()));
}

String template = IOUtils.toString(
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,13 @@

import org.wikivoyage.listings.entity.Listing;

public class EmailValidator implements Validator {
public class EmailValidator extends SimpleValidator {
@Override
public String validate(Listing poi) {
public void validate(Listing poi) {
if (poi.getEmail() != null && !poi.getEmail().equals("")) {
if (!org.apache.commons.validator.routines.EmailValidator.getInstance().isValid(poi.getEmail())) {
return "Invalid e-mail '" + poi.getEmail() + "'";
poi.add(ValidationIssue.INVALID_EMAIL);
}
}
return null;
}

@Override
public String getIssueType() {
return "E-mail";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@

import org.wikivoyage.listings.entity.Listing;

public class LatitudeValidator implements Validator {
public class LatitudeValidator extends SimpleValidator {
@Override
public String validate(Listing poi) {
public void validate(Listing poi) {
if (poi.getLatitude() != null && !poi.getLatitude().equals("")) {
try {
Float.parseFloat(poi.getLatitude());
} catch (NumberFormatException e) {
return "Malformed latitude '" + poi.getLatitude() + "'";
poi.add(ValidationIssue.INVALID_LATITUDE);
}
}
return null;
}

@Override
public String getIssueType() {
return "Latitude";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,15 @@

import org.wikivoyage.listings.entity.Listing;

public class LongitudeValidator implements Validator {
public class LongitudeValidator extends SimpleValidator {
@Override
public String validate(Listing poi) {
public void validate(Listing poi) {
if (poi.getLongitude() != null && !poi.getLongitude().equals("")) {
try {
Float.parseFloat(poi.getLongitude());
} catch (NumberFormatException e) {
return "Malformed longitude '" + poi.getLongitude() + "'";
poi.add(ValidationIssue.INVALID_LONGITUDE);
}
}
return null;
}

@Override
public String getIssueType() {
return "Longitude";
}
}
Loading

0 comments on commit 7d2abc7

Please sign in to comment.