Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved SAS scanning #200

Merged
merged 4 commits into from
Feb 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion iniFileExamples/WhiteRabbit.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
WORKING_FOLDER = /users/joe # Path to the folder where all output will be written
# Usage: dist/bin/whiteRabbit -ini <ini_file_path>
WORKING_FOLDER = /users/joe # Path to the folder where all output will be written
DATA_TYPE = PostgreSQL # "Delimited text files", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "Redshift", or "SAS7bdat"
SERVER_LOCATION = 127.0.0.1/data_base_name # Name or address of the server. For Postgres, add the database name
USER_NAME = joe # User name for the database
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ public static Database generateModelFromScanReport(String filename) {
index = fieldName2ColumnIndex.get("Max length");
if (index != null && index >= 0 && index < row.size())
field.setMaxLength((int) (Double.parseDouble(row.get(index))));

index = fieldName2ColumnIndex.get("Description");
if (index != null && index >= 0 && index < row.size())
field.setComment(row.get(index));

field.setValueCounts(getValueCounts(workbook, tableName, fieldName));
table.getFields().add(field);
}
Expand Down
5 changes: 5 additions & 0 deletions whiterabbit/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,10 @@
<artifactId>rabbit-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.7.21</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ public void windowClosing(WindowEvent e) {
}

private void launchCommandLine(String iniFileName) {
// TODO: add option to scan sas7bdat from command line, using ini file
IniFile iniFile = new IniFile(iniFileName);
DbSettings dbSettings = new DbSettings();
if (iniFile.get("DATA_TYPE").equalsIgnoreCase("Delimited text files")) {
Expand All @@ -158,6 +157,8 @@ private void launchCommandLine(String iniFileName) {
dbSettings.delimiter = '\t';
else
dbSettings.delimiter = iniFile.get("DELIMITER").charAt(0);
} else if (iniFile.get("DATA_TYPE").equalsIgnoreCase("SAS7bdat")) {
dbSettings.dataType = DbSettings.SASFILES;
} else {
dbSettings.dataType = DbSettings.DATABASE;
dbSettings.user = iniFile.get("USER_NAME");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,20 +140,20 @@ private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, Stri

SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk
CellStyle percentageStyle = workbook.createCellStyle();
percentageStyle.setDataFormat(workbook.createDataFormat().getFormat("0%"));
percentageStyle.setDataFormat(workbook.createDataFormat().getFormat("0.0%"));

// Create overview sheet
Sheet overviewSheet = workbook.createSheet("Overview");
if (!scanValues) {
addRow(overviewSheet, "Table", "Field", "Type", "N rows");
addRow(overviewSheet, "Table", "Field", "Description", "Type", "N rows");
for (String table : tables) {
for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) {
addRow(overviewSheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
addRow(overviewSheet, table, fieldInfo.name, fieldInfo.label, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
}
addRow(overviewSheet, "");
}
} else {
addRow(overviewSheet, "Table", "Field", "Type", "Max length", "N rows", "N rows checked", "Fraction empty", "N unique values", "Fraction unique values");
addRow(overviewSheet, "Table", "Field", "Description", "Type", "Max length", "N rows", "N rows checked", "Fraction empty", "N unique values", "Fraction unique values");
int sheetIndex = 0;
Map<String, String> sheetNameLookup = new HashMap<>();
for (String tableName : tables) {
Expand All @@ -166,15 +166,15 @@ private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, Stri
for (FieldInfo fieldInfo : tableToFieldInfos.get(tableName)) {
Long uniqueCount = fieldInfo.uniqueCount;
Double fractionUnique = fieldInfo.getFractionUnique();
addRow(overviewSheet, tableNameIndexed, fieldInfo.name, fieldInfo.getTypeDescription(),
addRow(overviewSheet, tableNameIndexed, fieldInfo.name, fieldInfo.label, fieldInfo.getTypeDescription(),
Integer.valueOf(fieldInfo.maxLength),
Long.valueOf(fieldInfo.rowCount),
Long.valueOf(fieldInfo.nProcessed),
fieldInfo.getFractionEmpty(),
fieldInfo.hasValuesTrimmed() ? String.format("<= %d", uniqueCount) : uniqueCount,
fieldInfo.hasValuesTrimmed() ? String.format("<= %.3f", fractionUnique) : fractionUnique
);
this.setCellStyles(overviewSheet, percentageStyle, 6, 8);
this.setCellStyles(overviewSheet, percentageStyle, 7, 9);
}
addRow(overviewSheet, "");
sheetIndex += 1;
Expand Down Expand Up @@ -406,20 +406,31 @@ private List<FieldInfo> processSasFile(String filename) {
try(FileInputStream inputStream = new FileInputStream(new File(filename))) {
SasFileReader sasFileReader = new SasFileReaderImpl(inputStream);

// It is possible to retrieve more information from the sasFileProperties, like data type and length.
SasFileProperties sasFileProperties = sasFileReader.getSasFileProperties();
for (Column column : sasFileReader.getColumns()) {
fieldInfos.add(new FieldInfo(column.getName()));
FieldInfo fieldInfo = new FieldInfo(column.getName());
fieldInfo.label = column.getLabel();
fieldInfo.rowCount = sasFileProperties.getRowCount();
if (!scanValues) {
// Note: type given by sas parser is either NUMBER or STRING.
// If scanning values, this produces a more granular type.
fieldInfo.type = column.getType().getName().replace("java.lang.", "");
}
fieldInfos.add(fieldInfo);
}

for (int lineNr = 0; lineNr < sasFileProperties.getRowCount(); lineNr++) {
Object[] row = sasFileReader.readNext();

if (row.length == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
for (int i = 0; i < row.length; i++) {
fieldInfos.get(i).processValue(row[i] == null ? "" : row[i].toString());
}
if (row.length != fieldInfos.size()) {
// A formatting error, skip
continue;
}

for (int i = 0; i < row.length; i++) {
fieldInfos.get(i).processValue(row[i] == null ? "" : row[i].toString());
}

if (lineNr == sampleSize)
break;
}
Expand All @@ -438,6 +449,7 @@ private List<FieldInfo> processSasFile(String filename) {
private class FieldInfo {
public String type;
public String name;
public String label;
public CountingSet<String> valueCounts = new CountingSet<>();
public long sumLength = 0;
public int maxLength = 0;
Expand Down Expand Up @@ -564,11 +576,13 @@ private void addRow(Sheet sheet, Object... values) {
for (Object value : values) {
Cell cell = row.createCell(row.getPhysicalNumberOfCells());

if (value instanceof Integer || value instanceof Long || value instanceof Double)
if (value instanceof Integer || value instanceof Long || value instanceof Double) {
cell.setCellValue(Double.parseDouble(value.toString()));
else
} else if (value != null) {
cell.setCellValue(value.toString());

} else {
cell.setCellValue("");
}
}
}

Expand Down