Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Issue #160]: compact from and to arbitrary storage, including tail files. #162

Merged
merged 1 commit into from
Feb 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -298,5 +298,22 @@ Connect to presto-cli:
cd ~/opt/presto-server
./bin/presto --server=localhost:8080 --catalog=pixels-presto --schema tpch
```
Execute the TPC-H queries in presto-cli.

Execute the TPC-H queries in presto-cli.
### Data Compaction*
This is optional. It is only needed if we want to test the query performance on the compact layout.
In pixels-load, use the following command to compact the files in the ordered path of each table:
```bash
COMPACT -s tpch -t customer -l 1 -n no
COMPACT -s tpch -t lineitem -l 2 -n no
COMPACT -s tpch -t orders -l 4 -n no
COMPACT -s tpch -t part -l 5 -n no
COMPACT -s tpch -t partsupp -l 6 -n no
COMPACT -s tpch -t supplier -l 8 -n no
```
The tables `nation` and `region` are too small, so there is no need to compact them.
Compaction is faster than loading.

To avoid scanning the small files in the ordered path during query execution,
create an empty bucket in S3 and change the ordered path in the metadata database
to the empty bucket.
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,16 @@ public String ensureSchemePrefix(String path) throws IOException
public List<Status> listStatus(String path) throws IOException
{
Path p = new Path(path);
File[] files = new File(p.realPath).listFiles();
File file = new File(p.realPath);
File[] files = null;
if (file.isDirectory())
{
files = file.listFiles();
}
else
{
files = new File[] {file};
}
if (files == null)
{
throw new IOException("Failed to list files in path: " + p.realPath + ".");
Expand All @@ -178,7 +187,16 @@ public Status getStatus(String path)
public List<String> listPaths(String path) throws IOException
{
Path p = new Path(path);
File[] files = new File(p.realPath).listFiles();
File file = new File(p.realPath);
File[] files = null;
if (file.isDirectory())
{
files = file.listFiles();
}
else
{
files = new File[] {file};
}
if (files == null)
{
throw new IOException("Failed to list files in path: " + p.realPath + ".");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ public static CompactLayout fromCompact(Compact compact)
return layout;
}

/**
* Build the naive row-group-first compact layout.
* @param rowGroupNumber
* @param columnNumber
* @return
*/
public static CompactLayout buildNaive(int rowGroupNumber, int columnNumber)
{
CompactLayout layout = new CompactLayout(rowGroupNumber, columnNumber);
Expand All @@ -66,6 +72,25 @@ public static CompactLayout buildNaive(int rowGroupNumber, int columnNumber)
return layout;
}

/**
 * Build the pure column-first compact layout.
 * The columnlet indices are appended column by column: for each column id
 * (ascending), one index is appended for every row group id (ascending).
 * @param rowGroupNumber the number of row groups covered by the layout
 * @param columnNumber the number of columns covered by the layout
 * @return the compact layout populated in column-first order
 */
public static CompactLayout buildPure(int rowGroupNumber, int columnNumber)
{
    CompactLayout pureLayout = new CompactLayout(rowGroupNumber, columnNumber);
    // The outer loop walks the columns, so all row groups of one column
    // are appended before moving on to the next column.
    for (int columnId = 0; columnId < columnNumber; ++columnId)
    {
        for (int rowGroupId = 0; rowGroupId < rowGroupNumber; ++rowGroupId)
        {
            pureLayout.append(rowGroupId, columnId);
        }
    }
    return pureLayout;
}

protected void append(int rowGroupId, int columnId)
{
this.indices.add(new ColumnletIndex(rowGroupId, columnId));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import io.pixelsdb.pixels.common.physical.Status;
import io.pixelsdb.pixels.common.physical.Storage;
import io.pixelsdb.pixels.common.physical.StorageFactory;
import io.pixelsdb.pixels.core.PixelsFooterCache;
import io.pixelsdb.pixels.core.PixelsReader;
import io.pixelsdb.pixels.core.PixelsReaderImpl;
import org.junit.Test;
Expand All @@ -36,17 +37,19 @@ public void test()
{
PixelsReader pixelsReader = null;
//String filePath = "hdfs://presto00:9000/pixels/testNull_pixels/201806190954180.pxl";
String filePath = "hdfs://presto00:9000/pixels/pixels/testnull_pixels/v_0_order/";
String filePath = "file:///home/hank/Downloads/20220213200323_1.compact.pxl";
try
{
Storage storage = StorageFactory.Instance().getStorage("hdfs");
Storage storage = StorageFactory.Instance().getStorage(filePath);
List<Status> fileStatuses = storage.listStatus(filePath);
int i = 0;
for (Status fileStatus : fileStatuses)
{
pixelsReader = PixelsReaderImpl.newBuilder()
.setStorage(storage)
.setPath(fileStatus.getPath())
.setEnableCache(false)
.setPixelsFooterCache(new PixelsFooterCache())
.build();
// System.out.println(pixelsReader.getRowGroupNum());
if (pixelsReader.getFooter().getRowGroupStatsList().size() != 1)
Expand All @@ -56,6 +59,7 @@ public void test()
i++;
pixelsReader.close();
}
System.out.println(i + " file(s) in total.");
}
catch (IOException e)
{
Expand Down
27 changes: 23 additions & 4 deletions pixels-load/src/main/java/io/pixelsdb/pixels/load/multi/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -467,14 +467,33 @@ public static void main(String args[])

// get input file paths
ConfigFactory configFactory = ConfigFactory.Instance();
Storage storage = StorageFactory.Instance().getStorage("hdfs"); // TODO: support other storage type.
Storage orderStorage = StorageFactory.Instance().getStorage(layout.getOrderPath());
Storage compactStorage = StorageFactory.Instance().getStorage(layout.getCompactPath());
long blockSize = Long.parseLong(configFactory.getProperty("block.size")) * 1024l * 1024l;
short replication = Short.parseShort(configFactory.getProperty("block.replication"));
List<Status> statuses = storage.listStatus(layout.getOrderPath());
List<Status> statuses = orderStorage.listStatus(layout.getOrderPath());

// compact
for (int i = 0; i + numRowGroupInBlock < statuses.size(); i+=numRowGroupInBlock)
for (int i = 0; i < statuses.size(); i+=numRowGroupInBlock)
{
if (i + numRowGroupInBlock > statuses.size())
{
/**
* Issue #160:
* Compact the tail files that cannot completely fill the compactLayout
* defined in the metadata.
* Note that if (i + numRowGroupInBlock == statuses.size()),
* then the remaining files are not tail files.
*
* Here we set numRowGroupInBlock to the number of tail files,
* and rebuild a pure compactLayout for the tail files as the
* compactLayout in metadata does not work for the tail files.
*/
numRowGroupInBlock = statuses.size() - i;
compactLayout = CompactLayout.buildPure(
numRowGroupInBlock, compact.getNumColumn());
}

List<String> sourcePaths = new ArrayList<>();
for (int j = 0; j < numRowGroupInBlock; ++j)
{
Expand All @@ -491,7 +510,7 @@ public static void main(String args[])
PixelsCompactor.newBuilder()
.setSourcePaths(sourcePaths)
.setCompactLayout(compactLayout)
.setStorage(storage)
.setStorage(compactStorage)
.setFilePath(filePath)
.setBlockSize(blockSize)
.setReplication(replication)
Expand Down