src/main/java/datasources/FlexibleRowDataSource.java

package datasources;

import datasources.utils.DBClientWrapper;
import datasources.utils.DBTableReader;
import edb.common.UnknownTableException;
import org.apache.log4j.Logger;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.util.List;

/**
 * Another simple DataSource that supports sequential reads (i.e.: on just one executor)
 * from the ExampleDB. It gets a table name from its configuration and infers a schema from
 * that table.
 */
public class FlexibleRowDataSource implements DataSourceV2, ReadSupport {

    static Logger log = Logger.getLogger(FlexibleRowDataSource.class.getName());

    /**
     * Spark calls this to create the reader. Notice how it pulls the host and port
     * on which ExampleDB is listening, as well as a table name, from the supplied options.
     * @param options
     * @return
     */
    @Override
    public DataSourceReader createReader(DataSourceOptions options) {
        String host = options.get("host").orElse("localhost");
        int port = options.getInt("port", -1);
        String table = options.get("table").orElse("unknownTable"); // TODO: throw
        return new Reader(host, port, table);
    }

    /**
     * This is how Spark discovers the source table's schema by requesting a schema from ExmapleDB,
     * and how it obtains the reader factories to be used by the executors to create readers.
     * In this case only one reader factory is created, supporting just one executor, so the
     * resulting Dataset will have only a single partition -- that's why this DataSource
     * only provides sequential reads.
     */
    static class Reader implements DataSourceReader {

        static Logger log = Logger.getLogger(Reader.class.getName());

        public Reader(String host, int port, String table) {
            _host = host;
            _port = port;
            _table = table;
        }

        private StructType _schema;
        private String _host;
        private int _port;
        private String _table;

        @Override
        public StructType readSchema() {
            if (_schema == null) {
                DBClientWrapper db = new DBClientWrapper(_host, _port);
                db.connect();
                try {
                    _schema = db.getSparkSchema(_table);
                } catch (UnknownTableException ute) {
                    throw new RuntimeException(ute);
                } finally {
                    db.disconnect();
                }
            }
            return _schema;
        }

        @Override
        public List<DataReaderFactory<Row>> createDataReaderFactories() {
            log.info("creating a single factory");
            return java.util.Collections.singletonList(
                    new SimpleDataReaderFactory(_host, _port, _table, readSchema()));
        }
    }

    /**
     * This is used by a single executor to read from ExampleDB. Notice how it
     * reads all of the data since it knows that only one instance will exist at a time --
     * again because this DataSource only supports sequential data access.
     * Also note that when DBClientWrapper's getTableReader() method is called
     * it reads ALL the data in the table eagerly.
     */
    static class TaskDataReader implements DataReader<Row> {

        static Logger log = Logger.getLogger(TaskDataReader.class.getName());

        public TaskDataReader(String host, int port, String table, StructType schema)
                throws UnknownTableException {
            log.info("Task reading from [" + host + ":" + port + "]" );
            _db = new DBClientWrapper(host, port);
            _db.connect();
            _reader = _db.getTableReader(table, schema.fieldNames());
        }

        private DBClientWrapper _db;

        private DBTableReader _reader;

        @Override
        public boolean next() {
            return _reader.next();
        }

        @Override
        public Row get() {
            return _reader.get();
        }

        @Override
        public void close() throws IOException {
            _db.disconnect();
        }
    }

    /**
     * Note that this has to be serializable. Each instance is sent to an executor,
     * which uses it to create a reader for its own use.
     */
    static class SimpleDataReaderFactory implements DataReaderFactory<Row> {

        static Logger log = Logger.getLogger(SimpleDataReaderFactory.class.getName());

        public SimpleDataReaderFactory(String host, int port,
                                       String table, StructType schema) {
            _host = host;
            _port = port;
            _table = table;
            _schema = schema;
        }

        private String _host;
        private int _port;
        private String _table;
        private StructType _schema;

        @Override
        public DataReader<Row> createDataReader() {
            log.info("Factory creating reader for [" + _host + ":" + _port + "]" );
            try {
                return new TaskDataReader(_host, _port, _table, _schema);
            } catch (UnknownTableException ute) {
                throw new RuntimeException(ute);
            }
        }

    }


}