Skip to content

Commit

Permalink
Merge pull request #195 from thehyve/wr-scan-sas
Browse files Browse the repository at this point in the history
Scan SAS files
  • Loading branch information
Maxim Moinat authored Dec 24, 2019
2 parents 9d7f595 + faaeb76 commit 9656b4f
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 52 deletions.
5 changes: 5 additions & 0 deletions rabbit-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,10 @@
<artifactId>avro</artifactId>
<version>1.8.2</version>
</dependency>
<dependency>
<groupId>com.epam</groupId>
<artifactId>parso</artifactId>
<version>2.0</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
public class DbSettings {
public static int DATABASE = 1;
public static int CSVFILES = 2;

public static int SASFILES = 3;

public int dataType;
public List<String> tables = new ArrayList<String>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public class WhiteRabbitMain implements ActionListener {
private JList<String> tableList;
private Vector<String> tables = new Vector<String>();
private boolean sourceIsFiles = true;
private boolean sourceIsSas = false;
private boolean targetIsFiles = false;

private List<JComponent> componentsToDisableWhenRunning = new ArrayList<JComponent>();
Expand Down Expand Up @@ -148,6 +149,7 @@ public void windowClosing(WindowEvent e) {
}

private void launchCommandLine(String iniFileName) {
// TODO: add option to scan sas7bdat from command line, using ini file
IniFile iniFile = new IniFile(iniFileName);
DbSettings dbSettings = new DbSettings();
if (iniFile.get("DATA_TYPE").equalsIgnoreCase("Delimited text files")) {
Expand Down Expand Up @@ -266,47 +268,45 @@ public void actionPerformed(ActionEvent e) {
sourcePanel.setLayout(new GridLayout(0, 2));
sourcePanel.setBorder(BorderFactory.createTitledBorder("Source data location"));
sourcePanel.add(new JLabel("Data type"));
sourceType = new JComboBox<String>(new String[] { "Delimited text files", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType = new JComboBox<>(new String[] { "Delimited text files", "SAS7bdat", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType.setToolTipText("Select the type of source data available");
sourceType.addItemListener(new ItemListener() {

@Override
public void itemStateChanged(ItemEvent arg0) {
sourceIsFiles = arg0.getItem().toString().equals("Delimited text files");
sourceServerField.setEnabled(!sourceIsFiles);
sourceUserField.setEnabled(!sourceIsFiles);
sourcePasswordField.setEnabled(!sourceIsFiles);
sourceDatabaseField.setEnabled(!sourceIsFiles);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(!sourceIsFiles);

if (!sourceIsFiles && arg0.getItem().toString().equals("Oracle")) {
sourceServerField
.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField
.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceType.addItemListener(itemEvent -> {
String selectedSourceType = itemEvent.getItem().toString();
sourceIsFiles = selectedSourceType.equals("Delimited text files");
sourceIsSas = selectedSourceType.equals("SAS7bdat");
boolean sourceIsDatabase = !(sourceIsFiles || sourceIsSas);

sourceServerField.setEnabled(sourceIsDatabase);
sourceUserField.setEnabled(sourceIsDatabase);
sourcePasswordField.setEnabled(sourceIsDatabase);
sourceDatabaseField.setEnabled(sourceIsDatabase);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(sourceIsDatabase);

if (sourceIsDatabase && selectedSourceType.equals("Oracle")) {
sourceServerField.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (sourceIsDatabase && selectedSourceType.equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (sourceIsDatabase && selectedSourceType.equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (sourceIsDatabase) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (selectedSourceType.equals("SQL Server")) {
sourceUserField.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
} else {
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (!sourceIsFiles) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (arg0.getItem().toString().equals("SQL Server"))
sourceUserField
.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
else
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
});
sourcePanel.add(sourceType);
Expand Down Expand Up @@ -469,6 +469,7 @@ public void actionPerformed(ActionEvent e) {
}

private JPanel createFakeDataPanel() {
// TODO: add sas7bdat as target for fake data.
JPanel panel = new JPanel();

panel.setLayout(new GridBagLayout());
Expand Down Expand Up @@ -694,12 +695,16 @@ private void addAllTables() {
private void pickTables() {
DbSettings sourceDbSettings = getSourceDbSettings();
if (sourceDbSettings != null) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES || sourceDbSettings.dataType == DbSettings.SASFILES) {
JFileChooser fileChooser = new JFileChooser(new File(folderField.getText()));
fileChooser.setMultiSelectionEnabled(true);
fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
FileNameExtensionFilter filter = new FileNameExtensionFilter("Delimited text files", "csv", "txt");
fileChooser.setFileFilter(filter);

if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("Delimited text files", "csv", "txt"));
} else if (sourceDbSettings.dataType == DbSettings.SASFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("SAS Data Files", "sas7bdat"));
}

int returnVal = fileChooser.showDialog(frame, "Select tables");
if (returnVal == JFileChooser.APPROVE_OPTION) {
Expand Down Expand Up @@ -746,6 +751,8 @@ private DbSettings getSourceDbSettings() {
dbSettings.delimiter = '\t';
else
dbSettings.delimiter = sourceDelimiterField.getText().charAt(0);
} else if (sourceType.getSelectedItem().equals("SAS7bdat")) {
dbSettings.dataType = DbSettings.SASFILES;
} else {
dbSettings.dataType = DbSettings.DATABASE;
dbSettings.user = sourceUserField.getText();
Expand Down Expand Up @@ -789,7 +796,7 @@ else if (sourceType.getSelectedItem().toString().equals("Teradata"))
}

private void testConnection(DbSettings dbSettings) {
if (dbSettings.dataType == DbSettings.CSVFILES) {
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES) {
if (new File(folderField.getText()).exists()) {
String message = "Folder " + folderField.getText() + " found";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Working folder found", JOptionPane.INFORMATION_MESSAGE);
Expand Down Expand Up @@ -902,7 +909,7 @@ else if (sourceType.getSelectedItem().toString().equals("SQL Server")) {

private void scanRun() {
if (tables.size() == 0) {
if (sourceIsFiles) {
if (sourceIsFiles || sourceIsSas) {
String message = "No files selected for scanning";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "No files selected", JOptionPane.ERROR_MESSAGE);
return;
Expand Down Expand Up @@ -971,7 +978,7 @@ public void run() {
DbSettings dbSettings = getSourceDbSettings();
if (dbSettings != null) {
for (String table : tables) {
if (dbSettings.dataType == DbSettings.CSVFILES)
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES)
table = folderField.getText() + "/" + table;
dbSettings.tables.add(table);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.ohdsi.whiteRabbit.scan;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.ResultSet;
Expand All @@ -31,6 +32,10 @@
import java.util.function.Function;
import java.util.stream.Collectors;

import com.epam.parso.Column;
import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
Expand Down Expand Up @@ -72,16 +77,20 @@ public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, i
if (!scanValues)
this.minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV);
tableToFieldInfos = processCsvFiles(dbSettings);
} else
} else if (dbSettings.dataType == DbSettings.SASFILES) {
tableToFieldInfos = processSasFiles(dbSettings);
} else {
tableToFieldInfos = processDatabase(dbSettings);
}
generateReport(tableToFieldInfos, filename);
}

private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {
// GBQ requires database. Put database value into domain var
if (dbSettings.dbType == DbType.BIGQUERY) {
// GBQ requires database. Put database value into domain var
dbSettings.domain = dbSettings.database;
};

try (RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType)) {
connection.setVerbose(false);
connection.use(dbSettings.database);
Expand All @@ -91,13 +100,12 @@ private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {

return dbSettings.tables.stream()
.collect(Collectors.toMap(Function.identity(), table -> processDatabaseTable(table, connection)));

}
}

private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
delimiter = dbSettings.delimiter;
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<>();
for (String table : dbSettings.tables) {
List<FieldInfo> fieldInfos = processCsvFile(table);
String tableName = new File(table).getName();
Expand All @@ -106,7 +114,20 @@ private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
} else {
tableToFieldInfos.put(table, fieldInfos);
}
}
return tableToFieldInfos;
}

/**
 * Scans every SAS file listed in the settings and gathers the resulting field information per table.
 *
 * Each result is stored under the bare file name. If that name is already present in the map,
 * the result is stored under the full entry from {@code dbSettings.tables} instead.
 *
 * @param dbSettings settings whose {@code tables} list holds the SAS files to scan
 * @return map from table key (file name, or the full entry on a name clash) to its scanned fields
 */
private Map<String, List<FieldInfo>> processSasFiles(DbSettings dbSettings) {
    Map<String, List<FieldInfo>> scannedFields = new HashMap<>();
    for (String path : dbSettings.tables) {
        List<FieldInfo> fields = processSasFile(path);
        String baseName = new File(path).getName();
        // Fall back to the full entry when the bare file name has already been used as a key
        String key = scannedFields.containsKey(baseName) ? path : baseName;
        scannedFields.put(key, fields);
    }
    return scannedFields;
}
Expand Down Expand Up @@ -348,7 +369,7 @@ else if (dbType == DbType.BIGQUERY) {

private List<FieldInfo> processCsvFile(String filename) {
StringUtilities.outputWithTime("Scanning table " + filename);
List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
List<FieldInfo> fieldInfos = new ArrayList<>();
int lineNr = 0;
for (String line : new ReadTextFile(filename)) {
lineNr++;
Expand All @@ -369,7 +390,7 @@ private List<FieldInfo> processCsvFile(String filename) {
fieldInfos.get(i).processValue(row.get(i));
}
}
if (sampleSize != -1 && lineNr == sampleSize)
if (lineNr == sampleSize)
break;
}
for (FieldInfo fieldInfo : fieldInfos)
Expand All @@ -378,10 +399,46 @@ private List<FieldInfo> processCsvFile(String filename) {
return fieldInfos;
}

/**
 * Scans a single SAS file and collects value statistics for each of its columns.
 *
 * One {@code FieldInfo} is created per column reported by the reader. Rows are then read and every
 * cell is passed to the matching {@code FieldInfo} as a string (null cells become the empty string).
 * At most {@code sampleSize} rows are read when {@code sampleSize} is positive; a non-positive
 * value means all rows declared in the file properties are read.
 *
 * An {@link IOException} while reading is printed and the fields collected so far are returned.
 *
 * @param filename path of the SAS file to scan
 * @return the field information gathered for the file, one entry per column
 */
private List<FieldInfo> processSasFile(String filename) {
    StringUtilities.outputWithTime("Scanning table " + filename);
    List<FieldInfo> fieldInfos = new ArrayList<>();

    // try-with-resources closes the stream on every exit path, so no explicit close() is needed
    try (FileInputStream inputStream = new FileInputStream(new File(filename))) {
        SasFileReader sasFileReader = new SasFileReaderImpl(inputStream);

        // It is possible to retrieve more information from the sasFileProperties, like data type and length.
        SasFileProperties sasFileProperties = sasFileReader.getSasFileProperties();
        for (Column column : sasFileReader.getColumns()) {
            fieldInfos.add(new FieldInfo(column.getName()));
        }

        // Cap the number of rows up front so exactly sampleSize rows are sampled (not sampleSize + 1)
        long rowCount = sasFileProperties.getRowCount();
        long rowLimit = sampleSize > 0 ? Math.min(rowCount, sampleSize) : rowCount;

        for (long rowNr = 0; rowNr < rowLimit; rowNr++) {
            Object[] row = sasFileReader.readNext();
            if (row == null) {
                // The reader ran out of rows before the declared row count was reached
                break;
            }

            if (row.length == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
                for (int i = 0; i < row.length; i++) {
                    fieldInfos.get(i).processValue(row[i] == null ? "" : row[i].toString());
                }
            }
        }
    } catch (IOException e) {
        // Best effort: report the problem and continue with whatever was collected so far
        e.printStackTrace();
    }

    for (FieldInfo fieldInfo : fieldInfos) {
        fieldInfo.trim();
    }

    return fieldInfos;
}

private class FieldInfo {
public String type;
public String name;
public CountingSet<String> valueCounts = new CountingSet<String>();
public CountingSet<String> valueCounts = new CountingSet<>();
public long sumLength = 0;
public int maxLength = 0;
public long nProcessed = 0;
Expand Down

0 comments on commit 9656b4f

Please sign in to comment.