Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scan SAS files #195

Merged
merged 3 commits into from
Dec 24, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions rabbit-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,10 @@
<artifactId>avro</artifactId>
<version>1.8.2</version>
</dependency>
<dependency>
<groupId>com.epam</groupId>
<artifactId>parso</artifactId>
<version>2.0</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
public class DbSettings {
public static int DATABASE = 1;
public static int CSVFILES = 2;

public static int SASFILES = 3;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to make this an enum? Alternatively, you could distinguish on “database” vs “file” as source type, and for each source type be able to select a format (db: postgres; file: csv)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not implementing this at the moment; it would affect many parts of the code.


public int dataType;
public List<String> tables = new ArrayList<String>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public class WhiteRabbitMain implements ActionListener {
private JList<String> tableList;
private Vector<String> tables = new Vector<String>();
private boolean sourceIsFiles = true;
private boolean sourceIsSas = false;
private boolean targetIsFiles = false;

private List<JComponent> componentsToDisableWhenRunning = new ArrayList<JComponent>();
Expand Down Expand Up @@ -148,6 +149,7 @@ public void windowClosing(WindowEvent e) {
}

private void launchCommandLine(String iniFileName) {
// TODO: add option to scan sas7bdat from command line, using ini file
IniFile iniFile = new IniFile(iniFileName);
DbSettings dbSettings = new DbSettings();
if (iniFile.get("DATA_TYPE").equalsIgnoreCase("Delimited text files")) {
Expand Down Expand Up @@ -266,47 +268,43 @@ public void actionPerformed(ActionEvent e) {
sourcePanel.setLayout(new GridLayout(0, 2));
sourcePanel.setBorder(BorderFactory.createTitledBorder("Source data location"));
sourcePanel.add(new JLabel("Data type"));
sourceType = new JComboBox<String>(new String[] { "Delimited text files", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType = new JComboBox<>(new String[] { "Delimited text files", "SAS7bdat", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType.setToolTipText("Select the type of source data available");
sourceType.addItemListener(new ItemListener() {

@Override
public void itemStateChanged(ItemEvent arg0) {
sourceIsFiles = arg0.getItem().toString().equals("Delimited text files");
sourceServerField.setEnabled(!sourceIsFiles);
sourceUserField.setEnabled(!sourceIsFiles);
sourcePasswordField.setEnabled(!sourceIsFiles);
sourceDatabaseField.setEnabled(!sourceIsFiles);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(!sourceIsFiles);

if (!sourceIsFiles && arg0.getItem().toString().equals("Oracle")) {
sourceServerField
.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField
.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceType.addItemListener(itemEvent -> {
sourceIsFiles = itemEvent.getItem().toString().equals("Delimited text files");
MaximMoinat marked this conversation as resolved.
Show resolved Hide resolved
sourceIsSas = itemEvent.getItem().toString().equals("SAS7bdat");
boolean sourceIsDatabase = !(sourceIsFiles || sourceIsSas);
sourceServerField.setEnabled(sourceIsDatabase);
sourceUserField.setEnabled(sourceIsDatabase);
sourcePasswordField.setEnabled(sourceIsDatabase);
sourceDatabaseField.setEnabled(sourceIsDatabase);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(sourceIsDatabase);

if (sourceIsDatabase && itemEvent.getItem().toString().equals("Oracle")) {
sourceServerField.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (sourceIsDatabase && itemEvent.getItem().toString().equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (sourceIsDatabase && itemEvent.getItem().toString().equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (sourceIsDatabase) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (itemEvent.getItem().toString().equals("SQL Server")) {
sourceUserField.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
} else {
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (!sourceIsFiles) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (arg0.getItem().toString().equals("SQL Server"))
sourceUserField
.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
else
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
});
sourcePanel.add(sourceType);
Expand Down Expand Up @@ -469,6 +467,7 @@ public void actionPerformed(ActionEvent e) {
}

private JPanel createFakeDataPanel() {
// TODO: add sas7bdat as target for fake data.
JPanel panel = new JPanel();

panel.setLayout(new GridBagLayout());
Expand Down Expand Up @@ -694,12 +693,16 @@ private void addAllTables() {
private void pickTables() {
DbSettings sourceDbSettings = getSourceDbSettings();
if (sourceDbSettings != null) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES || sourceDbSettings.dataType == DbSettings.SASFILES) {
JFileChooser fileChooser = new JFileChooser(new File(folderField.getText()));
fileChooser.setMultiSelectionEnabled(true);
fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
FileNameExtensionFilter filter = new FileNameExtensionFilter("Delimited text files", "csv", "txt");
fileChooser.setFileFilter(filter);

if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("Delimited text files", "csv", "txt"));
} else if (sourceDbSettings.dataType == DbSettings.SASFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("SAS Data Files", "sas7bdat"));
}

int returnVal = fileChooser.showDialog(frame, "Select tables");
if (returnVal == JFileChooser.APPROVE_OPTION) {
Expand Down Expand Up @@ -746,6 +749,8 @@ private DbSettings getSourceDbSettings() {
dbSettings.delimiter = '\t';
else
dbSettings.delimiter = sourceDelimiterField.getText().charAt(0);
} else if (sourceType.getSelectedItem().equals("SAS7bdat")) {
dbSettings.dataType = DbSettings.SASFILES;
} else {
dbSettings.dataType = DbSettings.DATABASE;
dbSettings.user = sourceUserField.getText();
Expand Down Expand Up @@ -789,7 +794,7 @@ else if (sourceType.getSelectedItem().toString().equals("Teradata"))
}

private void testConnection(DbSettings dbSettings) {
if (dbSettings.dataType == DbSettings.CSVFILES) {
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES) {
if (new File(folderField.getText()).exists()) {
String message = "Folder " + folderField.getText() + " found";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Working folder found", JOptionPane.INFORMATION_MESSAGE);
Expand Down Expand Up @@ -902,7 +907,7 @@ else if (sourceType.getSelectedItem().toString().equals("SQL Server")) {

private void scanRun() {
if (tables.size() == 0) {
if (sourceIsFiles) {
if (sourceIsFiles || sourceIsSas) {
String message = "No files selected for scanning";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "No files selected", JOptionPane.ERROR_MESSAGE);
return;
Expand Down Expand Up @@ -971,7 +976,7 @@ public void run() {
DbSettings dbSettings = getSourceDbSettings();
if (dbSettings != null) {
for (String table : tables) {
if (dbSettings.dataType == DbSettings.CSVFILES)
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES)
table = folderField.getText() + "/" + table;
dbSettings.tables.add(table);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.ohdsi.whiteRabbit.scan;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.ResultSet;
Expand All @@ -31,6 +32,10 @@
import java.util.function.Function;
import java.util.stream.Collectors;

import com.epam.parso.Column;
import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
Expand Down Expand Up @@ -72,16 +77,20 @@ public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, i
if (!scanValues)
this.minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV);
tableToFieldInfos = processCsvFiles(dbSettings);
} else
} else if (dbSettings.dataType == DbSettings.SASFILES) {
tableToFieldInfos = processSasFiles(dbSettings);
} else {
tableToFieldInfos = processDatabase(dbSettings);
}
generateReport(tableToFieldInfos, filename);
}

private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {
// GBQ requires database. Put database value into domain var
if (dbSettings.dbType == DbType.BIGQUERY) {
// GBQ requires database. Put database value into domain var
dbSettings.domain = dbSettings.database;
};

try (RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType)) {
connection.setVerbose(false);
connection.use(dbSettings.database);
Expand All @@ -91,13 +100,12 @@ private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {

return dbSettings.tables.stream()
.collect(Collectors.toMap(Function.identity(), table -> processDatabaseTable(table, connection)));

}
}

private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
delimiter = dbSettings.delimiter;
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<>();
for (String table : dbSettings.tables) {
List<FieldInfo> fieldInfos = processCsvFile(table);
String tableName = new File(table).getName();
Expand All @@ -106,7 +114,20 @@ private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
} else {
tableToFieldInfos.put(table, fieldInfos);
}
}
return tableToFieldInfos;
}

/**
 * Scans every SAS file listed in the settings and gathers the resulting field information per table.
 *
 * Each result is stored under the bare file name; when that name is already taken by an
 * earlier file, the entry is stored under the full path given in the settings instead.
 *
 * @param dbSettings settings whose {@code tables} list holds the paths of the files to scan
 * @return map from table key (file name, or full path on a name clash) to the scanned field information
 */
private Map<String, List<FieldInfo>> processSasFiles(DbSettings dbSettings) {
    Map<String, List<FieldInfo>> scanResults = new HashMap<>();
    for (String path : dbSettings.tables) {
        List<FieldInfo> scannedFields = processSasFile(path);
        String baseName = new File(path).getName();
        // Fall back to the full path when two files share the same base name
        String key = scanResults.containsKey(baseName) ? path : baseName;
        scanResults.put(key, scannedFields);
    }
    return scanResults;
}
Expand Down Expand Up @@ -348,7 +369,7 @@ else if (dbType == DbType.BIGQUERY) {

private List<FieldInfo> processCsvFile(String filename) {
StringUtilities.outputWithTime("Scanning table " + filename);
List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
List<FieldInfo> fieldInfos = new ArrayList<>();
int lineNr = 0;
for (String line : new ReadTextFile(filename)) {
lineNr++;
Expand Down Expand Up @@ -378,10 +399,50 @@ private List<FieldInfo> processCsvFile(String filename) {
return fieldInfos;
}

private List<FieldInfo> processSasFile(String filename) {
StringUtilities.outputWithTime("Scanning table " + filename);
List<FieldInfo> fieldInfos = new ArrayList<>();

// TODO: try with resources and print warning on exception
FileInputStream inputStream;
try {
inputStream = new FileInputStream(new File(filename));

SasFileReader sasFileReader = new SasFileReaderImpl(inputStream);

// TODO: retrieve more information from the sasFileProperties, like data type and length.
SasFileProperties sasFileProperties = sasFileReader.getSasFileProperties();
for (Column column : sasFileReader.getColumns()) {
fieldInfos.add(new FieldInfo(column.getName()));
}

for (int i = 0; i < sasFileProperties.getRowCount(); i++) {
Object[] row = sasFileReader.readNext();

if (row.length == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
for (int j = 0; j < row.length; j++) {
fieldInfos.get(j).processValue(row[j] == null ? "" : row[j].toString());
}
}
if (sampleSize != -1 && i == sampleSize)
MaximMoinat marked this conversation as resolved.
Show resolved Hide resolved
break;
}
inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}

for (FieldInfo fieldInfo : fieldInfos) {
fieldInfo.trim();
}

return fieldInfos;
}

private class FieldInfo {
public String type;
public String name;
public CountingSet<String> valueCounts = new CountingSet<String>();
public CountingSet<String> valueCounts = new CountingSet<>();
public long sumLength = 0;
public int maxLength = 0;
public long nProcessed = 0;
Expand Down