Skip to content

Commit

Permalink
CLDR-15923 Integrate UN Literacy Parser, update unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
srl295 committed Feb 23, 2024
1 parent 1763474 commit c9ab6de
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 38 deletions.
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
package org.unicode.cldr.tool;

import com.ibm.icu.text.ListFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.CldrUtility.LineHandler;
import org.unicode.cldr.util.Counter2;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.StandardCodes;

import com.ibm.icu.text.ListFormat;
import com.ibm.icu.text.NumberFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

public class AddPopulationData {
static boolean ADD_POP = CldrUtility.getProperty("ADD_POP", false);
static boolean SHOW_ALTERNATE_NAMES = CldrUtility.getProperty("SHOW_ALTERNATE_NAMES", false);
Expand Down Expand Up @@ -480,10 +480,22 @@ public boolean handle(String line) {
}

/**
 * Populates {@code un_literacy} with every (code, value) pair produced by
 * {@link #getUnLiteracy(Output)}.
 *
 * @throws IOException propagated from {@code getUnLiteracy}
 */
static void loadUnLiteracy() throws IOException {
    // No error flag is requested by this caller, hence the null argument.
    final List<Pair<String, Double>> codeAndValue = getUnLiteracy(null);
    final Iterator<Pair<String, Double>> it = codeAndValue.iterator();
    while (it.hasNext()) {
        final Pair<String, Double> entry = it.next();
        un_literacy.add(entry.getFirst(), entry.getSecond());
    }
}

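/**
* Reads the UN literacy data and computes a literacy fraction for each country code.
*
* @param hadErr set to true on return if a country name could not be mapped to a code, or if
* no values were produced
* @return list of (country code, literate / (literate + illiterate)) pairs
* @throws IOException if the UN literacy data could not be read
*/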
static List<Pair<String, Double>> getUnLiteracy(Output<Boolean> hadErr) throws IOException {
List<Pair<String, Double>> result = new LinkedList<>();
UnLiteracyParser ulp;
try {
ulp = new UnLiteracyParser().read();
} catch(Throwable t) {
} catch (Throwable t) {
throw new IOException("Could not read UN data " + UnLiteracyParser.UN_LITERACY, t);
}

Expand All @@ -497,6 +509,9 @@ static void loadUnLiteracy() throws IOException {

String code = CountryCodeConverter.getCodeFromName(country, true, missing);
if (code == null) {
if (hadErr != null) {
hadErr.value = true;
}
continue;
}
if (!StandardCodes.isCountry(code)) {
Expand All @@ -507,8 +522,12 @@ static void loadUnLiteracy() throws IOException {
}
double total = literate + illiterate;
double percent = ((double) literate) / total;
un_literacy.add(code, percent);
result.add(Pair.of(code, percent));
}
// hadErr is optional: loadUnLiteracy() passes null, and the earlier error path in this
// method already guards for that. Guard here too so an empty result is reported through
// the flag when one was supplied, instead of throwing a NullPointerException.
if (result.isEmpty() && hadErr != null) {
    hadErr.value = true;
}
return result;
}

static {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
package org.unicode.cldr.tool;

import com.ibm.icu.number.LocalizedNumberFormatter;
import com.ibm.icu.number.NumberFormatter;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.checkerframework.checker.units.qual.C;
import org.unicode.cldr.util.XMLFileReader;
import org.unicode.cldr.util.XPathParts;

import com.ibm.icu.number.LocalizedNumberFormatter;
import com.ibm.icu.number.NumberFormatter;

public class UnLiteracyParser extends XMLFileReader.SimpleHandler {

private static final String VALUE = "Value";
Expand All @@ -38,9 +35,24 @@ public static void main(String args[]) {
Long unknown = py.total(UNKNOWN);
Long total = py.total(TOTAL);

System.out.println(country + "\t" + latest + "\t" + literate + "/" + illiterate + ", " + unknown + " = " + total);
System.out.println(
country
+ "\t"
+ latest
+ "\t"
+ literate
+ "/"
+ illiterate
+ ", "
+ unknown
+ " = "
+ total);
if ((literate + illiterate + unknown) != total) {
System.out.println("- doesn't add up for " + country + " - total is " + (literate+illiterate+unknown));
System.out.println(
"- doesn't add up for "
+ country
+ " - total is "
+ (literate + illiterate + unknown));
}
}
}
Expand All @@ -49,14 +61,22 @@ public static void main(String args[]) {

// Reading stuff
public static final String UN_LITERACY = "external/un_literacy.xml";

UnLiteracyParser read() {
System.out.println("* Reading " + UN_LITERACY);
new XMLFileReader()
.setHandler(this).readCLDRResource(UN_LITERACY, XMLFileReader.CONTENT_HANDLER, false);
.setHandler(this)
.readCLDRResource(UN_LITERACY, XMLFileReader.CONTENT_HANDLER, false);
// get the final record
handleNewRecord();
LocalizedNumberFormatter nf = NumberFormatter.with().locale(Locale.ENGLISH);
System.out.println("* Read " + nf.format(recCount) + " record(s) with " + nf.format(perCountry.size()) + " region(s) from " + UN_LITERACY);
System.out.println(
"* Read "
+ nf.format(recCount)
+ " record(s) with "
+ nf.format(perCountry.size())
+ " region(s) from "
+ UN_LITERACY);
return this;
}

Expand All @@ -71,19 +91,20 @@ public void handlePathValue(String path, String value) {
}

@Override
public
void handleElement(CharSequence path) {
public void handleElement(CharSequence path) {
if ("//ROOT/data/record".equals(path.toString())) {
handleNewRecord();
}
}

// Data ingestion
final Map<String,String> thisRecord = new HashMap<String,String>();
final Map<String, String> thisRecord = new HashMap<String, String>();

private void handleField(String field, String value) {
final String old = thisRecord.put(field, value);
if (old != null) {
throw new IllegalArgumentException("Duplicate field " + field + ", context: " + thisRecord);
throw new IllegalArgumentException(
"Duplicate field " + field + ", context: " + thisRecord);
}
}

Expand All @@ -98,8 +119,8 @@ private void handleNewRecord() {

boolean validate() {
try {
assertEqual("Area","Total");
assertEqual("Sex","Both Sexes");
assertEqual("Area", "Total");
assertEqual("Sex", "Both Sexes");

assertPresent(AGE);
assertPresent(COUNTRY_OR_AREA);
Expand All @@ -109,7 +130,7 @@ boolean validate() {
assertPresent(RELIABILITY);

return true;
} catch(Throwable t) {
} catch (Throwable t) {
final String context = thisRecord.toString();
throw new IllegalArgumentException("While parsing " + context, t);
}
Expand All @@ -119,7 +140,7 @@ void assertPresent(String field) {
String value = get(field);
if (value == null) {
throw new NullPointerException("Missing field: " + field);
} else if(value.isEmpty()) {
} else if (value.isEmpty()) {
throw new NullPointerException("Empty field: " + field);
}
}
Expand All @@ -128,7 +149,8 @@ void assertEqual(String field, String expected) {
assertPresent(field);
String value = get(field);
if (!value.equals(expected)) {
throw new NullPointerException("Expected " + field + "=" + expected + " but got " + value);
throw new NullPointerException(
"Expected " + field + "=" + expected + " but got " + value);
}
}

Expand All @@ -144,14 +166,19 @@ private void handleRecord() {
final String age = get(AGE);
final String literacy = get(LITERACY);
final String reliability = get(RELIABILITY);
final PerAge pa = perCountry.computeIfAbsent(country, (String c) -> new PerCountry())
.perYear.computeIfAbsent(year, (String y) -> new PerYear())
.perAge.computeIfAbsent(age, (String a) -> new PerAge());
final PerAge pa =
perCountry
.computeIfAbsent(country, (String c) -> new PerCountry())
.perYear
.computeIfAbsent(year, (String y) -> new PerYear())
.perAge
.computeIfAbsent(age, (String a) -> new PerAge());

if (pa.reliability == null) {
pa.reliability = reliability;
} else if (!pa.reliability.equals(reliability)) {
throw new IllegalArgumentException("Inconsistent reliability " + reliability + " for " + thisRecord);
throw new IllegalArgumentException(
"Inconsistent reliability " + reliability + " for " + thisRecord);
}
final Long old = pa.perLiteracy.put(literacy, getLongValue());
if (old != null) {
Expand All @@ -161,7 +188,8 @@ private void handleRecord() {

private long getLongValue() {
final String value = get(VALUE);
if (value.contains(".")) { // yes. some of the data has decimal points. Ignoring the fractional part.
if (value.contains(
".")) { // yes. some of the data has decimal points. Ignoring the fractional part.
return Long.parseLong(value.split("\\.")[0]);
} else {
return Long.parseLong(value);
Expand All @@ -171,18 +199,21 @@ private long getLongValue() {
final Map<String, PerCountry> perCountry = new TreeMap<String, PerCountry>();

final class PerCountry {
final Map<String, PerYear> perYear = new TreeMap<String,PerYear>();
final Map<String, PerYear> perYear = new TreeMap<String, PerYear>();

public String latest() {
final String y[] = perYear.keySet().toArray(new String[0]);
return y[y.length-1];
return y[y.length - 1];
}
}

final class PerYear {
final Map<String, PerAge> perAge = new TreeMap<String, PerAge>();

Long total(String literacy) {
return perAge.values().stream().map((pa) -> pa.perLiteracy.getOrDefault(literacy, 0L)).reduce(0L, (Long a, Long b) -> a + b);
return perAge.values().stream()
.map((pa) -> pa.perLiteracy.getOrDefault(literacy, 0L))
.reduce(0L, (Long a, Long b) -> a + b);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,10 @@ public class XMLFileReader {
private SimpleHandler simpleHandler;

public static class SimpleHandler {
/** called when every new element is encountered, with the full path to the element (including attributes).
* Called on leaf and non-leaf elements.
/**
* called when every new element is encountered, with the full path to the element
* (including attributes). Called on leaf and non-leaf elements.
*
* @param path
*/
public void handleElement(CharSequence path) {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ HK; Hong Kong SAR China; China Hong Kong
HK; Hong Kong SAR China; Hong Kong
HK; Hong Kong SAR China; Hong Kong, China
HK; Hong Kong SAR China; Hong Kong SAR, China
HK; Hong Kong SAR China; China, Hong Kong SAR

IR; Iran; Iran, Islamic Rep.
IR; Iran; Iran, Islamic Republic of
Expand Down Expand Up @@ -115,6 +116,8 @@ MO; Macau SAR China; Macao, China
MO; Macau SAR China; Macau
MO; Macau SAR China; China, Macao Special Administrative Region
MO; Macau SAR China; Macao SAR, China
MO; Macau SAR China; China, Macao SAR


PN; Pitcairn Islands; Pitcairn Islands

Expand All @@ -140,6 +143,7 @@ SZ; Eswatini; eSwatini; Swaziland
SZ; Eswatini; Swaziland

SH; Saint Helena; Saint Helena, Ascension, and Tristan da Cunha
SH; Saint Helena; Saint Helena ex. dep.

TL; East Timor; Timor-Leste
TL; East Timor; East Timor
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
package org.unicode.cldr.tool;

import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertFalse;

import com.ibm.icu.util.Output;
import java.io.IOException;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.unicode.cldr.util.Pair;

/** Unit test for the UN literacy loading done by {@code AddPopulationData}. */
public class TestAddPopulationData {
/**
 * Calls {@code AddPopulationData.getUnLiteracy} with an error flag and checks that the flag
 * stays false and that the returned list is not empty.
 *
 * @throws IOException declared by the {@code AddPopulationData} calls below
 */
@Test
public void TestParseUnStats() throws IOException {
AddPopulationData.loadUnLiteracy();
// Flag starts as false; getUnLiteracy() sets it to true when it hits a problem.
Output<Boolean> err = new Output<>(false);
// this is already run once during static init. we run it again to capture err value
List<Pair<String, Double>> unLiteracy = AddPopulationData.getUnLiteracy(err);
assertFalse(
err.value,
"getUnLiteracy() returned errs - check err log for 'ERROR: CountryCodeConverter'");
assertFalse(unLiteracy.isEmpty(), "un literacy shouldn't be empty");
// optionally dump out values (disabled; change the condition to true to print each pair)
if (false)
for (final Pair<String, Double> p : unLiteracy) {
System.out.println(p.getFirst() + " - " + p.getSecond());
}
}
}

0 comments on commit c9ab6de

Please sign in to comment.