From be0908b45827c69ff010d9263ff865310ce722a1 Mon Sep 17 00:00:00 2001 From: Robert Pang Date: Sat, 30 Mar 2019 15:39:30 -0700 Subject: [PATCH] [YSQL] #955: Handle primary key as an index Summary: Primary key of a YSQL table is now handled as a table index so that it can be used in an index scan path. The current ybc_fdw scan path becomes a simple sequential scan. Test Plan: Jenkins Reviewers: neil, neha, mihnea Reviewed By: mihnea Subscribers: yql Differential Revision: https://phabricator.dev.yugabyte.com/D6326 --- .../test/java/org/yb/pgsql/BasePgSQLTest.java | 17 + .../org/yb/pgsql/TestPgPrepareExecute.java | 9 +- .../test/java/org/yb/pgsql/TestPgSelect.java | 19 +- .../src/backend/access/index/indexam.c | 12 +- src/postgres/src/backend/access/ybc/ybcam.c | 360 ++++++++++++++---- src/postgres/src/backend/access/ybc/ybcin.c | 70 +++- .../src/backend/bootstrap/bootparse.y | 53 ++- .../src/backend/bootstrap/bootscanner.l | 3 +- src/postgres/src/backend/catalog/dependency.c | 9 +- src/postgres/src/backend/catalog/index.c | 6 +- src/postgres/src/backend/catalog/yb_genbki.pl | 3 +- src/postgres/src/backend/commands/ybccmds.c | 8 +- .../src/backend/executor/nodeIndexonlyscan.c | 10 +- .../src/backend/executor/nodeModifyTable.c | 16 +- src/postgres/src/backend/executor/ybcScan.c | 90 ++++- src/postgres/src/backend/executor/ybc_fdw.c | 359 +---------------- .../src/backend/parser/parse_utilcmd.c | 4 +- .../src/backend/utils/cache/syscache.c | 59 --- src/postgres/src/include/access/itup.h | 1 + src/postgres/src/include/access/relscan.h | 4 +- src/postgres/src/include/access/ybcam.h | 31 +- src/postgres/src/include/commands/ybccmds.h | 10 +- src/postgres/src/include/executor/ybcScan.h | 5 +- src/postgres/src/include/pg_yb_utils.h | 37 +- src/postgres/src/include/utils/syscache.h | 1 - .../test/regress/expected/yb_create_index.out | 200 ++++++++-- .../regress/expected/yb_feature_select.out | 6 +- .../src/test/regress/sql/yb_create_index.sql | 52 ++- 
.../test/regress/sql/yb_feature_select.sql | 6 +- 29 files changed, 850 insertions(+), 610 deletions(-) diff --git a/java/yb-pgsql/src/test/java/org/yb/pgsql/BasePgSQLTest.java b/java/yb-pgsql/src/test/java/org/yb/pgsql/BasePgSQLTest.java index e109622c9cfc..5403e096a8db 100644 --- a/java/yb-pgsql/src/test/java/org/yb/pgsql/BasePgSQLTest.java +++ b/java/yb-pgsql/src/test/java/org/yb/pgsql/BasePgSQLTest.java @@ -579,6 +579,23 @@ protected void assertRowSet(String stmt, Set expectedRows) throws SQLExcept } } + /* + * Returns whether or not this select statement uses index. + */ + protected boolean useIndex(String stmt, String index) throws SQLException { + try (Statement statement = connection.createStatement()) { + try (ResultSet rs = statement.executeQuery("EXPLAIN " + stmt)) { + assert(rs.getMetaData().getColumnCount() == 1); // Expecting one string column. + while (rs.next()) { + if (rs.getString(1).contains("Index Scan using " + index)) { + return true; + } + } + return false; + } + } + } + /* * Returns whether or not this select statement requires filtering by Postgres (i.e. not all * conditions can be pushed down to YugaByte). diff --git a/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgPrepareExecute.java b/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgPrepareExecute.java index 7c08756ac122..3b4017c35399 100644 --- a/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgPrepareExecute.java +++ b/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgPrepareExecute.java @@ -106,23 +106,24 @@ public void testJdbcPrepareExecute() throws Exception { } // Test bind variable pushdown: - // Equality on hash key -- expect no postgres filtering, everything should be pushed to YB. + // Equality on hash key -- expect index is used with index condition. 
String query = "EXPLAIN SELECT * FROM test WHERE h = ?"; try (PreparedStatement sel = connection.prepareStatement(query)) { sel.setLong(1, 2); ResultSet rs = sel.executeQuery(); List rows = getRowList(rs); - assertFalse(rows.toString().contains("Filter: ")); + assertTrue(rows.toString().contains("Index Cond: ")); } // Test bind variable pushdown: - // Inequality on hash key -- expect postgres filtering. + // Inequality on hash key -- expect index is used also with index condition. We do not support + // hash inequality in DocDB yet and the filtering is done inside the YB's index access method. query = "EXPLAIN SELECT * FROM test WHERE h > ?"; try (PreparedStatement sel = connection.prepareStatement(query)) { sel.setLong(1, 2); ResultSet rs = sel.executeQuery(); List rows = getRowList(rs); - assertTrue(rows.toString().contains("Filter: ")); + assertTrue(rows.toString().contains("Index Cond: ")); } } diff --git a/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgSelect.java b/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgSelect.java index 802d1b78d5ce..07b65e5c0178 100644 --- a/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgSelect.java +++ b/java/yb-pgsql/src/test/java/org/yb/pgsql/TestPgSelect.java @@ -35,13 +35,14 @@ public class TestPgSelect extends BasePgSQLTest { @Test public void testWhereClause() throws Exception { List allRows = setupSimpleTable("test_where"); + final String PRIMARY_KEY = "test_where_pkey"; try (Statement statement = connection.createStatement()) { // Test no where clause -- select all rows. String query = "SELECT * FROM test_where"; try (ResultSet rs = statement.executeQuery(query)) { assertEquals(allRows, getSortedRowList(rs)); } - assertFalse(needsPgFiltering(query)); + assertFalse(useIndex(query, PRIMARY_KEY)); // Test fixed hash key. 
query = "SELECT * FROM test_where WHERE h = 2"; @@ -52,7 +53,7 @@ public void testWhereClause() throws Exception { assertEquals(10, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertFalse(needsPgFiltering(query)); + assertTrue(useIndex(query, PRIMARY_KEY)); // Test fixed primary key. query = "SELECT * FROM test_where WHERE h = 2 AND r = 3.5"; @@ -64,7 +65,7 @@ public void testWhereClause() throws Exception { assertEquals(1, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertFalse(needsPgFiltering(query)); + assertTrue(useIndex(query, PRIMARY_KEY)); // Test fixed range key without fixed hash key. query = "SELECT * FROM test_where WHERE r = 6.5"; @@ -75,7 +76,7 @@ public void testWhereClause() throws Exception { assertEquals(10, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertTrue(needsPgFiltering(query)); + assertTrue(useIndex(query, PRIMARY_KEY)); // Test range scan. query = "SELECT * FROM test_where WHERE h = 2 AND r >= 3.5 AND r < 8.5"; @@ -88,7 +89,7 @@ public void testWhereClause() throws Exception { assertEquals(5, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertTrue(needsPgFiltering(query)); + assertTrue(useIndex(query, PRIMARY_KEY)); // Test conditions on regular (non-primary-key) columns. query = "SELECT * FROM test_where WHERE vi < 14 AND vs != 'v09'"; @@ -101,10 +102,10 @@ public void testWhereClause() throws Exception { assertEquals(13, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertTrue(needsPgFiltering(query)); + assertFalse(useIndex(query, PRIMARY_KEY)); // Test other WHERE operators (IN, OR, LIKE). 
- query = "SELECT * FROM test_where WHERE h IN (2,3) OR vs LIKE 'v_2'"; + query = "SELECT * FROM test_where WHERE h = 2 OR h = 3 OR vs LIKE 'v_2'"; try (ResultSet rs = statement.executeQuery(query)) { List expectedRows = allRows.stream() .filter(row -> row.getLong(0).equals(2L) || @@ -115,7 +116,7 @@ public void testWhereClause() throws Exception { assertEquals(28, expectedRows.size()); assertEquals(expectedRows, getSortedRowList(rs)); } - assertTrue(needsPgFiltering(query)); + assertFalse(useIndex(query, PRIMARY_KEY)); } } @@ -228,7 +229,7 @@ public void testJoins() throws Exception { // Test join with WHERE clause. joinStmt = "SELECT a.h, a.r, a.v as av, b.v as bv FROM t1 a LEFT JOIN t2 b " + - "ON (a.h = b.h and a.r = b.r) WHERE a.h = 1 AND a.r IN (2.5, 3.5)"; + "ON (a.h = b.h and a.r = b.r) WHERE a.h = 1 AND (a.r = 2.5 OR a.r = 3.5)"; try (ResultSet rs = statement.executeQuery(joinStmt)) { assertNextRow(rs, 1L, 2.5D, "abc", "foo"); assertNextRow(rs, 1L, 3.5D, "def", null); diff --git a/src/postgres/src/backend/access/index/indexam.c b/src/postgres/src/backend/access/index/indexam.c index d7c86dbed39f..b170af6defd3 100644 --- a/src/postgres/src/backend/access/index/indexam.c +++ b/src/postgres/src/backend/access/index/indexam.c @@ -633,9 +633,19 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) HeapTuple index_fetch_heap(IndexScanDesc scan) { - /* For YugaByte index, we need to select from the base table using ybctid */ + /* + * For YugaByte secondary indexes, we need to select from the base table using + * ybctid. For primary keys, the row is already prepared in "xs_hitup" that can + * be returned directly. 
+ */ if (IsYugaByteEnabled()) { + if (scan->indexRelation->rd_index->indisprimary) + { + Assert(scan->xs_hitup != 0); + return scan->xs_hitup; + } + return YBCFetchTuple(scan->heapRelation, scan->xs_ctup.t_ybctid); } diff --git a/src/postgres/src/backend/access/ybc/ybcam.c b/src/postgres/src/backend/access/ybc/ybcam.c index b6e7ee568cd0..8063506cdbcf 100644 --- a/src/postgres/src/backend/access/ybc/ybcam.c +++ b/src/postgres/src/backend/access/ybc/ybcam.c @@ -49,9 +49,10 @@ #include "yb/yql/pggate/ybc_pggate.h" #include "pg_yb_utils.h" -static YbSysScanDesc +static YbScanDesc setup_ybcscan_from_scankey(Relation relation, Relation index, + bool index_cols_only, int nkeys, ScanKey key) { @@ -59,38 +60,59 @@ setup_ybcscan_from_scankey(Relation relation, TupleDesc where_tupdesc = NULL; List *where_cond = NIL; List *target_attrs = NIL; - AttrNumber index_attno[CATCACHE_MAXKEYS]; + AttrNumber sk_attno[INDEX_MAX_KEYS * 2]; /* A pair of lower/upper bounds per column max */ + + if (nkeys > INDEX_MAX_KEYS * 2) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot use more than %d predicates in a table or index scan", + INDEX_MAX_KEYS * 2))); - /* - * If the scan uses an index, change attribute numbers to be index column numbers. - */ if (index) { - /* - * Scan with index is only used in sys catalog cache currently. Make sure the - * number of scan keys does not exceed the allocated size. - */ - Assert(nkeys <= CATCACHE_MAXKEYS); - - int i, j; - - for (i = 0; i < nkeys; i++) + if (index->rd_index->indisprimary) + { + /* + * If scanning a table using the primary key, change the attribute numbers in the + * scan key to the table's column numbers. 
+ */ + for (int i = 0; i < nkeys; i++) + { + sk_attno[i] = index->rd_index->indkey.values[key[i].sk_attno - 1]; + } + where_tupdesc = RelationGetDescr(relation); + } + else { - for (j = 0; j < index->rd_index->indnatts; j++) + /* + * If scanning a table using a local index in the same scan, change the attribute + * numbers in the scan key to the index's column numbers. + */ + int i, j; + + for (i = 0; i < nkeys; i++) { - if (key[i].sk_attno == index->rd_index->indkey.values[j]) + for (j = 0; j < index->rd_index->indnatts; j++) { - index_attno[i] = j + 1; - break; + if (key[i].sk_attno == index->rd_index->indkey.values[j]) + { + sk_attno[i] = j + 1; + break; + } } + if (j == index->rd_index->indnatts) + elog(ERROR, "column is not in index"); } - if (j == index->rd_index->indnatts) - elog(ERROR, "column is not in index"); + where_tupdesc = RelationGetDescr(index); } - where_tupdesc = RelationGetDescr(index); } else { + /* For regular table / index fetch, just copy the scan attribute numbers. */ + for (int i = 0; i < nkeys; i++) + { + sk_attno[i] = key[i].sk_attno; + } where_tupdesc = RelationGetDescr(relation); } @@ -99,9 +121,7 @@ setup_ybcscan_from_scankey(Relation relation, */ for (int i = 0; i < nkeys; i++) { - AttrNumber sk_attno = index ? index_attno[i] : key[i].sk_attno; - - if (sk_attno == InvalidOid) + if (sk_attno[i] == InvalidOid) break; /* @@ -109,10 +129,7 @@ setup_ybcscan_from_scankey(Relation relation, * SK_SEARCHNOTNULL. 
*/ if (key[i].sk_flags != 0) - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("WHERE condition option %d not supported yet", key[i].sk_flags), - errdetail("The WHERE condition option is not supported yet."), - errhint("Rewrite the condition differently."))); + continue; OpExpr *cond = makeNode(OpExpr); /* @@ -131,7 +148,7 @@ setup_ybcscan_from_scankey(Relation relation, /* Set up column (lhs) */ Var *lhs = makeNode(Var); - lhs->varattno = sk_attno; + lhs->varattno = sk_attno[i]; if (lhs->varattno > 0) { /* Get the type from the description */ @@ -154,7 +171,8 @@ setup_ybcscan_from_scankey(Relation relation, } /* - * Set up the scan targets, for catalog tables always all "real" columns. + * Set up the scan targets. If the table is indexed and only the indexed columns should be + * returned, fetch just those columns. Otherwise, fetch all "real" columns. */ if (relation->rd_rel->relhasoids) { @@ -162,11 +180,23 @@ setup_ybcscan_from_scankey(Relation relation, target->resno = ObjectIdAttributeNumber; target_attrs = lappend(target_attrs, target); } - for (AttrNumber attnum = 1; attnum <= tupdesc->natts; attnum++) + if (index != NULL && index_cols_only) { - TargetEntry *target = makeNode(TargetEntry); - target->resno = attnum; - target_attrs = lappend(target_attrs, target); + for (int i = 0; i < index->rd_index->indnatts; i++) + { + TargetEntry *target = makeNode(TargetEntry); + target->resno = index->rd_index->indkey.values[i]; + target_attrs = lappend(target_attrs, target); + } + } + else + { + for (AttrNumber attnum = 1; attnum <= tupdesc->natts; attnum++) + { + TargetEntry *target = makeNode(TargetEntry); + target->resno = attnum; + target_attrs = lappend(target_attrs, target); + } } /* @@ -178,27 +208,125 @@ setup_ybcscan_from_scankey(Relation relation, : YBTupleIdAttributeNumber; target_attrs = lappend(target_attrs, target); - /* Set up YugaByte system table description */ - YbSysScanDesc ybScan = (YbSysScanDesc) 
palloc0(sizeof(YbSysScanDescData)); - ybScan->state = ybcBeginScan(relation, index, target_attrs, where_cond); + /* Set up YugaByte scan description. Scan with an index if it is a secondary index */ + bool indisprimary = (index && index->rd_index->indisprimary); + bool indissecondary = (index && !index->rd_index->indisprimary); + YbScanDesc ybScan = (YbScanDesc) palloc0(sizeof(YbScanDescData)); + ybScan->state = ybcBeginScan(relation, indissecondary ? index : NULL, target_attrs, where_cond); ybScan->key = key; ybScan->nkeys = nkeys; + for (int i = 0; i < nkeys; i++) + { + /* If scanning with the primary key, switch the attribute numbers in the scan keys + * to the table's column numbers. + */ + ybScan->sk_attno[i] = indisprimary ? index->rd_index->indkey.values[key[i].sk_attno - 1] + : key[i].sk_attno; + } + return ybScan; } static bool -tuple_matches_key(HeapTuple tup, TupleDesc tupdesc, int nkeys, ScanKey key) +heaptuple_matches_key(HeapTuple tup, + TupleDesc tupdesc, + int nkeys, + ScanKey key, + AttrNumber sk_attno[], + bool *recheck) { + *recheck = false; + for (int i = 0; i < nkeys; i++) { - if (key[i].sk_attno == InvalidOid) - { + if (sk_attno[i] == InvalidOid) break; + + bool is_null = false; + Datum res_datum = heap_getattr(tup, sk_attno[i], tupdesc, &is_null); + + if (key[i].sk_flags & SK_SEARCHNULL) + { + if (is_null) + continue; + else + return false; + } + + if (key[i].sk_flags & SK_SEARCHNOTNULL) + { + if (!is_null) + continue; + else + return false; } + /* + * TODO: support the different search options like SK_SEARCHARRAY. 
+ */ + if (key[i].sk_flags != 0) + { + *recheck = true; + continue; + } + + if (is_null) + return false; + + bool matches = DatumGetBool(FunctionCall2Coll(&key[i].sk_func, + key[i].sk_collation, + res_datum, + key[i].sk_argument)); + if (!matches) + return false; + } + + return true; +} + +static bool +indextuple_matches_key(IndexTuple tup, + TupleDesc tupdesc, + int nkeys, + ScanKey key, + AttrNumber sk_attno[], + bool *recheck) +{ + *recheck = false; + + for (int i = 0; i < nkeys; i++) + { + if (sk_attno[i] == InvalidOid) + break; + bool is_null = false; - Datum res_datum = heap_getattr(tup, key[i].sk_attno, tupdesc, &is_null); + Datum res_datum = index_getattr(tup, sk_attno[i], tupdesc, &is_null); + + if (key[i].sk_flags & SK_SEARCHNULL) + { + if (is_null) + continue; + else + return false; + } + + if (key[i].sk_flags & SK_SEARCHNOTNULL) + { + if (!is_null) + continue; + else + return false; + } + + /* + * TODO: support the different search options like SK_SEARCHARRAY. + */ + if (key[i].sk_flags != 0) + { + *recheck = true; + continue; + } if (is_null) return false; @@ -220,7 +348,8 @@ HeapScanDesc ybc_heap_beginscan(Relation relation, ScanKey key, bool temp_snap) { - YbSysScanDesc ybScan = setup_ybcscan_from_scankey(relation, NULL /* index */, nkeys, key); + YbScanDesc ybScan = setup_ybcscan_from_scankey(relation, NULL /* index */, + false /* index_cols_only */, nkeys, key); /* Set up Postgres sys table scan description */ HeapScanDesc scan_desc = (HeapScanDesc) palloc0(sizeof(HeapScanDescData)); @@ -243,15 +372,21 @@ SysScanDesc ybc_systable_beginscan(Relation relation, Relation index = NULL; /* - * Look up the index to scan with if we can. + * Look up the index to scan with if we can. If the index is the primary key which is part + * of the table in YugaByte, we should scan the table directly. 
*/ - if (indexOK && !IgnoreSystemIndexes && !ReindexIsProcessingIndex(indexId) && - indexId != YBSysTablePrimaryKeyOid(RelationGetRelid(relation))) + if (indexOK && !IgnoreSystemIndexes && !ReindexIsProcessingIndex(indexId)) { index = RelationIdGetRelation(indexId); + if (index->rd_index->indisprimary) + { + RelationClose(index); + index = NULL; + } } - YbSysScanDesc ybScan = setup_ybcscan_from_scankey(relation, index, nkeys, key); + YbScanDesc ybScan = setup_ybcscan_from_scankey(relation, index, + false /* index_cols_only */, nkeys, key); /* Set up Postgres sys table scan description */ SysScanDesc scan_desc = (SysScanDesc) palloc0(sizeof(SysScanDescData)); @@ -267,37 +402,98 @@ SysScanDesc ybc_systable_beginscan(Relation relation, return scan_desc; } -void ybc_index_beginscan(Relation relation, +void ybc_pkey_beginscan(Relation relation, + Relation index, + IndexScanDesc scan_desc, + int nkeys, + ScanKey key) +{ + /* For rescan, end the previous scan. */ + if (scan_desc->opaque) + { + ybc_pkey_endscan(scan_desc); + scan_desc->opaque = NULL; + } + + /* + * In YugaByte, every table is organized by its primary key. Therefore, if we are scanning + * the primary key, look up the base table to prepare scanning it directly. + */ + Assert(index->rd_index->indisprimary); + + YbScanDesc ybScan = setup_ybcscan_from_scankey(relation, index, + scan_desc->xs_want_itup /* index_cols_only */, + nkeys, key); + ybScan->index = index; + + scan_desc->opaque = ybScan; +} + +void ybc_index_beginscan(Relation index, IndexScanDesc scan_desc, int nkeys, ScanKey key) { - /* - * For rescan, end the previous scan. - */ + /* For rescan, end the previous scan. */ if (scan_desc->opaque) { ybc_index_endscan(scan_desc); scan_desc->opaque = NULL; } - scan_desc->opaque = setup_ybcscan_from_scankey(relation, NULL /* index */, nkeys, key); + + /* + * Scan the index directly as if we are scanning a table. 
Passing "index_cols_only" as false + * because we are scanning the index directly, not scanning base table with an index. + */ + YbScanDesc ybScan = setup_ybcscan_from_scankey(index /* relation */, NULL /* index */, + false /* index_cols_only */, nkeys, key); + ybScan->index = index; + + scan_desc->opaque = ybScan; } -HeapTuple ybc_scan_getnext(YbScanState scan_state, - int nkeys, - ScanKey key) +static HeapTuple ybc_getnext_heaptuple(YbScanDesc ybScan, bool *recheck) { - HeapTuple ybtp = NULL; + YbScanState scan_state = ybScan->state; + int nkeys = ybScan->nkeys; + ScanKey key = ybScan->key; + AttrNumber *sk_attno = ybScan->sk_attno; + HeapTuple tup = NULL; + /* * YB Scan may not be able to push down the scan key condition so we may * need additional filtering here. */ - while (HeapTupleIsValid(ybtp = ybcFetchNext(scan_state))) + while (HeapTupleIsValid(tup = ybcFetchNextHeapTuple(scan_state))) { - if (tuple_matches_key(ybtp, scan_state->tupleDesc, nkeys, key)) - return ybtp; + if (heaptuple_matches_key(tup, scan_state->tupleDesc, nkeys, key, sk_attno, recheck)) + return tup; - heap_freetuple(ybtp); + heap_freetuple(tup); + } + + return NULL; +} + +static IndexTuple ybc_getnext_indextuple(YbScanDesc ybScan, bool *recheck) +{ + YbScanState scan_state = ybScan->state; + int nkeys = ybScan->nkeys; + ScanKey key = ybScan->key; + AttrNumber *sk_attno = ybScan->sk_attno; + Relation index = ybScan->index; + IndexTuple tup = NULL; + + /* + * YB Scan may not be able to push down the scan key condition so we may + * need additional filtering here. 
+ */ + while (PointerIsValid(tup = ybcFetchNextIndexTuple(scan_state, index))) + { + if (indextuple_matches_key(tup, RelationGetDescr(index), nkeys, key, sk_attno, recheck)) + return tup; + + pfree(tup); } return NULL; @@ -305,36 +501,44 @@ HeapTuple ybc_scan_getnext(YbScanState scan_state, HeapTuple ybc_heap_getnext(HeapScanDesc scan_desc) { + bool recheck = false; + Assert(PointerIsValid(scan_desc->ybscan)); - YbScanState scan_state = scan_desc->ybscan->state; - int nkeys = scan_desc->ybscan->nkeys; - ScanKey key = scan_desc->ybscan->key; + HeapTuple tuple = ybc_getnext_heaptuple(scan_desc->ybscan, &recheck); + + Assert(!recheck); - return ybc_scan_getnext(scan_state, nkeys, key); + return tuple; } HeapTuple ybc_systable_getnext(SysScanDesc scan_desc) { + bool recheck = false; + Assert(PointerIsValid(scan_desc->ybscan)); - YbScanState scan_state = scan_desc->ybscan->state; - int nkeys = scan_desc->ybscan->nkeys; - ScanKey key = scan_desc->ybscan->key; + HeapTuple tuple = ybc_getnext_heaptuple(scan_desc->ybscan, &recheck); + + Assert(!recheck); - return ybc_scan_getnext(scan_state, nkeys, key); + return tuple; } -HeapTuple ybc_index_getnext(IndexScanDesc scan_desc) +HeapTuple ybc_pkey_getnext(IndexScanDesc scan_desc) { - YbSysScanDesc ybscan = (YbSysScanDesc) scan_desc->opaque; + YbScanDesc ybscan = (YbScanDesc) scan_desc->opaque; Assert(PointerIsValid(ybscan)); - YbScanState scan_state = ybscan->state; - int nkeys = ybscan->nkeys; - ScanKey key = ybscan->key; + return ybc_getnext_heaptuple(ybscan, &scan_desc->xs_recheck); +} - return ybc_scan_getnext(scan_state, nkeys, key); +IndexTuple ybc_index_getnext(IndexScanDesc scan_desc) +{ + YbScanDesc ybscan = (YbScanDesc) scan_desc->opaque; + Assert(PointerIsValid(ybscan)); + + return ybc_getnext_indextuple(ybscan, &scan_desc->xs_recheck); } void ybc_heap_endscan(HeapScanDesc scan_desc) @@ -355,9 +559,17 @@ void ybc_systable_endscan(SysScanDesc scan_desc) pfree(scan_desc); } +void ybc_pkey_endscan(IndexScanDesc 
scan_desc) +{ + YbScanDesc ybscan = (YbScanDesc) scan_desc->opaque; + Assert(PointerIsValid(ybscan)); + ybcEndScan(ybscan->state); + pfree(ybscan); +} + void ybc_index_endscan(IndexScanDesc scan_desc) { - YbSysScanDesc ybscan = (YbSysScanDesc) scan_desc->opaque; + YbScanDesc ybscan = (YbScanDesc) scan_desc->opaque; Assert(PointerIsValid(ybscan)); ybcEndScan(ybscan->state); pfree(ybscan); diff --git a/src/postgres/src/backend/access/ybc/ybcin.c b/src/postgres/src/backend/access/ybc/ybcin.c index 8cf94388c260..cbe4345e77ee 100644 --- a/src/postgres/src/backend/access/ybc/ybcin.c +++ b/src/postgres/src/backend/access/ybc/ybcin.c @@ -39,7 +39,8 @@ /* Working state for ybcinbuild and its callback */ typedef struct { - double index_tuples; + bool isprimary; + double index_tuples; } YBCBuildState; static void @@ -48,7 +49,9 @@ ybcinbuildCallback(Relation index, HeapTuple heapTuple, Datum *values, bool *isn { YBCBuildState *buildstate = (YBCBuildState *)state; - YBCExecuteInsertIndex(index, values, isnull, heapTuple->t_ybctid); + if (!buildstate->isprimary) + YBCExecuteInsertIndex(index, values, isnull, heapTuple->t_ybctid); + buildstate->index_tuples += 1; } @@ -58,8 +61,6 @@ ybcinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) YBCBuildState buildstate; double heap_tuples = 0; - Assert(!index->rd_index->indisprimary); - PG_TRY(); { /* Buffer the inserts into the index for initdb */ @@ -67,6 +68,7 @@ ybcinbuild(Relation heap, Relation index, struct IndexInfo *indexInfo) YBCStartBufferingWriteOperations(); /* Do the heap scan */ + buildstate.isprimary = index->rd_index->indisprimary; buildstate.index_tuples = 0; heap_tuples = IndexBuildHeapScan(heap, index, indexInfo, true, ybcinbuildCallback, &buildstate, NULL); @@ -101,9 +103,8 @@ bool ybcininsert(Relation index, Datum *values, bool *isnull, Datum ybctid, Relation heap, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo) { - Assert(!index->rd_index->indisprimary); - - YBCExecuteInsertIndex(index, 
values, isnull, ybctid); + if (!index->rd_index->indisprimary) + YBCExecuteInsertIndex(index, values, isnull, ybctid); return index->rd_index->indisunique ? true : false; } @@ -112,7 +113,8 @@ void ybcindelete(Relation index, Datum *values, bool *isnull, Datum ybctid, Relation heap, struct IndexInfo *indexInfo) { - YBCExecuteDeleteIndex(index, values, isnull, ybctid); + if (!index->rd_index->indisprimary) + YBCExecuteDeleteIndex(index, values, isnull, ybctid); } IndexBulkDeleteResult * @@ -134,7 +136,15 @@ ybcinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) bool ybcincanreturn(Relation index, int attno) { - return false; + /* + * If "canreturn" is true, Postgres will attempt to perform index-only scan on the indexed + * columns and expect us to return the column values as an IndexTuple. This will be the case + * for secondary index. + * + * For indexes which are primary keys, we will return the table row as a HeapTuple instead. + * For this reason, we set "canreturn" to false for primary keys. + */ + return !index->rd_index->indisprimary; } void @@ -183,20 +193,52 @@ ybcinbeginscan(Relation rel, int nkeys, int norderbys) void ybcinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys) { - ybc_index_beginscan(scan->indexRelation, scan, nscankeys, scankey); + if (scan->indexRelation->rd_index->indisprimary) + ybc_pkey_beginscan(scan->heapRelation, scan->indexRelation, scan, nscankeys, scankey); + else + ybc_index_beginscan(scan->indexRelation, scan, nscankeys, scankey); } bool ybcingettuple(IndexScanDesc scan, ScanDirection dir) { - HeapTuple tuple = ybc_index_getnext(scan); - scan->xs_ctup.t_ybctid = (tuple != NULL) ? tuple->t_ybctid : 0; - scan->xs_recheck = false; /* no need to recheck because the scan key is exact match */ + scan->xs_ctup.t_ybctid = 0; + + /* + * If IndexTuple is requested or it is a secondary index, return the result as IndexTuple. 
+ * Otherwise, return the result as a HeapTuple of the base table. + */ + if (scan->xs_want_itup || !scan->indexRelation->rd_index->indisprimary) + { + IndexTuple tuple = ybc_index_getnext(scan); + + if (tuple) + { + scan->xs_ctup.t_ybctid = tuple->t_ybctid; + scan->xs_itup = tuple; + scan->xs_itupdesc = RelationGetDescr(scan->indexRelation); + } + } + else + { + HeapTuple tuple = ybc_pkey_getnext(scan); + + if (tuple) + { + scan->xs_ctup.t_ybctid = tuple->t_ybctid; + scan->xs_hitup = tuple; + scan->xs_hitupdesc = RelationGetDescr(scan->heapRelation); + } + } + return scan->xs_ctup.t_ybctid != 0; } void ybcinendscan(IndexScanDesc scan) { - ybc_index_endscan(scan); + if (scan->indexRelation->rd_index->indisprimary) + ybc_pkey_endscan(scan); + else + ybc_index_endscan(scan); } diff --git a/src/postgres/src/backend/bootstrap/bootparse.y b/src/postgres/src/backend/bootstrap/bootparse.y index 4db16e74265f..9a466302ea08 100644 --- a/src/postgres/src/backend/bootstrap/bootparse.y +++ b/src/postgres/src/backend/bootstrap/bootparse.y @@ -129,7 +129,7 @@ static int num_columns_read = 0; %token NULLVAL /* All the rest are unreserved, and should be handled in boot_ident! 
*/ %token OPEN XCLOSE XCREATE INSERT_TUPLE -%token XDECLARE YBDECLARE INDEX ON USING XBUILD INDICES UNIQUE XTOAST +%token XDECLARE YBDECLARE INDEX ON USING XBUILD INDICES PRIMARY UNIQUE XTOAST %token OBJ_ID XBOOTSTRAP XSHARED_RELATION XWITHOUT_OIDS XROWTYPE_OID %token XFORCE XNOT XNULL @@ -154,6 +154,7 @@ Boot_Query : | Boot_InsertStmt | Boot_DeclareIndexStmt | Boot_DeclareUniqueIndexStmt + | Boot_DeclarePrimaryIndexStmt | Boot_DeclareToastStmt | Boot_BuildIndsStmt ; @@ -187,7 +188,7 @@ Boot_CloseStmt: ; Boot_YBIndex: /* EMPTY */ { $$ = NULL; } - | YBDECLARE UNIQUE INDEX boot_ident oidspec ON boot_ident USING boot_ident + | YBDECLARE PRIMARY INDEX boot_ident oidspec ON boot_ident USING boot_ident LPAREN boot_index_params RPAREN { IndexStmt *stmt = makeNode(IndexStmt); @@ -206,7 +207,7 @@ Boot_YBIndex: stmt->indexOid = $5; stmt->oldNode = InvalidOid; stmt->unique = true; - stmt->primary = false; + stmt->primary = true; stmt->isconstraint = false; stmt->deferrable = false; stmt->initdeferred = false; @@ -450,6 +451,52 @@ Boot_DeclareUniqueIndexStmt: } ; +Boot_DeclarePrimaryIndexStmt: + XDECLARE PRIMARY INDEX boot_ident oidspec ON boot_ident USING boot_ident LPAREN boot_index_params RPAREN + { + IndexStmt *stmt = makeNode(IndexStmt); + Oid relationId; + + do_start(); + + stmt->idxname = $4; + stmt->relation = makeRangeVar(NULL, $7, -1); + stmt->accessMethod = $9; + stmt->tableSpace = NULL; + stmt->indexParams = $11; + stmt->options = NIL; + stmt->whereClause = NULL; + stmt->excludeOpNames = NIL; + stmt->idxcomment = NULL; + stmt->indexOid = InvalidOid; + stmt->oldNode = InvalidOid; + stmt->unique = true; + stmt->primary = true; + stmt->isconstraint = false; + stmt->deferrable = false; + stmt->initdeferred = false; + stmt->transformed = false; + stmt->concurrent = false; + stmt->if_not_exists = false; + + /* locks and races need not concern us in bootstrap mode */ + relationId = RangeVarGetRelid(stmt->relation, NoLock, + false); + + DefineIndex(relationId, + stmt, + 
$5, + InvalidOid, + InvalidOid, + false, + false, + false, + true, /* skip_build */ + false); + do_end(); + } + ; + Boot_DeclareToastStmt: XDECLARE XTOAST oidspec oidspec ON boot_ident { diff --git a/src/postgres/src/backend/bootstrap/bootscanner.l b/src/postgres/src/backend/bootstrap/bootscanner.l index ca55db2e1f91..9808e3a1b128 100644 --- a/src/postgres/src/backend/bootstrap/bootscanner.l +++ b/src/postgres/src/backend/bootstrap/bootscanner.l @@ -109,10 +109,11 @@ _null_ { return NULLVAL; } ^\#[^\n]* ; /* drop everything after "#" for comments */ declare { yylval.kw = "declare"; return XDECLARE; } -yb_declare { yylval.kw = "yb_declare"; return YBDECLARE; } /* For YugaByte indexes/pkeys */ +yb_declare { yylval.kw = "yb_declare"; return YBDECLARE; } /* For YugaByte pkeys */ build { yylval.kw = "build"; return XBUILD; } indices { yylval.kw = "indices"; return INDICES; } unique { yylval.kw = "unique"; return UNIQUE; } +primary { yylval.kw = "primary"; return PRIMARY; } /* For YugaByte pkeys */ index { yylval.kw = "index"; return INDEX; } on { yylval.kw = "on"; return ON; } using { yylval.kw = "using"; return USING; } diff --git a/src/postgres/src/backend/catalog/dependency.c b/src/postgres/src/backend/catalog/dependency.c index ff66ae14ecb7..116fe16dc87d 100644 --- a/src/postgres/src/backend/catalog/dependency.c +++ b/src/postgres/src/backend/catalog/dependency.c @@ -1131,7 +1131,14 @@ doDeletion(const ObjectAddress *object, int flags) Assert(object->objectSubId == 0); if (IsYugaByteEnabled() && IsYBRelationByKind(relKind)) - YBCDropIndex(object->objectId); + { + Relation index = RelationIdGetRelation(object->objectId); + + if (!index->rd_index->indisprimary) + YBCDropIndex(object->objectId); + + RelationClose(index); + } index_drop(object->objectId, concurrent); } else diff --git a/src/postgres/src/backend/catalog/index.c b/src/postgres/src/backend/catalog/index.c index d9139a8338c0..b93e16e80ea8 100644 --- a/src/postgres/src/backend/catalog/index.c +++ 
b/src/postgres/src/backend/catalog/index.c @@ -949,7 +949,11 @@ index_create(Relation heapRelation, Assert(indexRelationId == RelationGetRelid(indexRelation)); - if (IsYugaByteEnabled()) + /* + * Create index in YugaByte only if it is a secondary index. Primary key is + * an implicit part of the base table in YugaByte and doesn't need to be created. + */ + if (IsYugaByteEnabled() && !isprimary) { YBCCreateIndex(indexRelationName, indexInfo, diff --git a/src/postgres/src/backend/catalog/yb_genbki.pl b/src/postgres/src/backend/catalog/yb_genbki.pl index cc376b7db2ee..b8c94333e7fb 100644 --- a/src/postgres/src/backend/catalog/yb_genbki.pl +++ b/src/postgres/src/backend/catalog/yb_genbki.pl @@ -348,6 +348,7 @@ } if ($pkidx) { + $pkidx =~ s/unique index/primary index/; print $bki " yb_" . $pkidx; $pkidxs{$pkidxname} = 1; } @@ -458,7 +459,7 @@ { my ($unique, $idxname, $oid, $icatname, $columns) = /declare (unique )?index (.*) (\d+) on (.+) using (.+)/; - next if $idxname && $pkidxs{$idxname}; + s/unique index/primary index/ if $idxname && $pkidxs{$idxname}; print $bki $_; } diff --git a/src/postgres/src/backend/commands/ybccmds.c b/src/postgres/src/backend/commands/ybccmds.c index 4e089ac5c47b..680f077a0c01 100644 --- a/src/postgres/src/backend/commands/ybccmds.c +++ b/src/postgres/src/backend/commands/ybccmds.c @@ -320,7 +320,9 @@ YBCCreateIndex(const char *indexName, HandleYBStatus(YBCPgDeleteStatement(handle)); } -void YBCAlterTable(AlterTableStmt *stmt, Relation rel, Oid relationId) { +void +YBCAlterTable(AlterTableStmt *stmt, Relation rel, Oid relationId) +{ YBCPgStatement handle = NULL; HandleYBStatus(YBCPgNewAlterTable(ybc_pg_session, MyDatabaseId, @@ -383,7 +385,9 @@ void YBCAlterTable(AlterTableStmt *stmt, Relation rel, Oid relationId) { } } -void YBCRename(RenameStmt *stmt, Oid relationId) { +void +YBCRename(RenameStmt *stmt, Oid relationId) +{ YBCPgStatement handle = NULL; char *db_name = get_database_name(MyDatabaseId); diff --git 
a/src/postgres/src/backend/executor/nodeIndexonlyscan.c b/src/postgres/src/backend/executor/nodeIndexonlyscan.c index 8c32a74d39ea..6ca791a75bcb 100644 --- a/src/postgres/src/backend/executor/nodeIndexonlyscan.c +++ b/src/postgres/src/backend/executor/nodeIndexonlyscan.c @@ -41,6 +41,7 @@ #include "utils/memutils.h" #include "utils/rel.h" +#include "pg_yb_utils.h" static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, @@ -154,8 +155,11 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * YugaByte index tuple is always visible. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!IsYugaByteEnabled() && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -241,8 +245,10 @@ IndexOnlyNext(IndexOnlyScanState *node) * locks need the tuple's xmin value. If we had to visit the tuple * anyway, then we already have the tuple-level lock and can skip the * page lock. + * + * YugaByte index tuple does not require locking. */ - if (tuple == NULL) + if (tuple == NULL && !IsYugaByteEnabled()) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); diff --git a/src/postgres/src/backend/executor/nodeModifyTable.c b/src/postgres/src/backend/executor/nodeModifyTable.c index e0b47dd62398..7be50c90f498 100644 --- a/src/postgres/src/backend/executor/nodeModifyTable.c +++ b/src/postgres/src/backend/executor/nodeModifyTable.c @@ -956,10 +956,13 @@ ldelete:; } else if (IsYugaByteEnabled()) { + /* + * Prepare the original tuple in inner slot for RETURNING clause execution. 
+ */ if (!IsYBRelation(resultRelationDesc)) { ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("This relational object does not exist in YugaByte database"))); + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("This relational object does not exist in YugaByte database"))); } slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, planSlot); delbuffer = InvalidBuffer; @@ -1114,6 +1117,12 @@ ExecUpdate(ModifyTableState *mtstate, { YBCExecuteUpdate(resultRelationDesc, planSlot, tuple); + /* + * Prepare the updated tuple in inner slot for RETURNING clause execution. + */ + if (resultRelInfo->ri_projectReturning) + slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, planSlot); + if (resultRelInfo->ri_NumIndices > 0) { /* @@ -1129,9 +1138,6 @@ ExecUpdate(ModifyTableState *mtstate, estate, false, NULL, NIL); } - - if (resultRelInfo->ri_projectReturning) - slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, planSlot); } else { diff --git a/src/postgres/src/backend/executor/ybcScan.c b/src/postgres/src/backend/executor/ybcScan.c index ba7e5dae925e..472781077ebb 100644 --- a/src/postgres/src/backend/executor/ybcScan.c +++ b/src/postgres/src/backend/executor/ybcScan.c @@ -21,7 +21,7 @@ * * This is meant to a be a common api betwen regular scan path (ybc_fdw.c) * and syscatalog scan path (ybcam.c). - * TODO currently this is only used by syscatalog scan path, ybc_fdw needs to + * TODO currently this is only used by sys catalog and index scan paths, ybc_fdw needs to * be refactored. */ @@ -45,7 +45,7 @@ #include "pg_yb_utils.h" -void ybcFreeScanState(YbScanState ybc_state) +static void ybcFreeScanState(YbScanState ybc_state) { /* If yb_fdw_exec_state is NULL, we are in EXPLAIN; nothing to do */ if (ybc_state != NULL && ybc_state->handle != NULL) @@ -231,11 +231,21 @@ static void ybcAddWhereCond(Expr *expr, YBCPgStatement yb_stmt, bool useIndex) } } +/* + * Begin a scan of a table or index. 
When "rel" is a table, an optional local "index" may be given, + * in which case we will scan using that index co-located with the table (which currently applies + * to sys catalog table in yb-master only) in the same scan. The attribute numbers in "yb_conds" + * will be those of the index whereas the ones in "target_attrs" will be those of the table. + * + * Alternatively, when scanning an index directly, "rel" points to the index itself and "index" + * will be NULL. The attribute numbers in "target_attrs" and "yb_conds" will be those of the index. + */ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List *yb_conds) { Oid dboid = YBCGetDatabaseOid(rel); Oid relid = RelationGetRelid(rel); - Oid index_id = index ? RelationGetRelid(index) : InvalidOid; + bool useIndex = (index != NULL); + Oid index_id = useIndex ? RelationGetRelid(index) : InvalidOid; YbScanState ybc_state = NULL; ListCell *lc; @@ -259,7 +269,7 @@ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List ybc_state->tupleDesc = RelationGetDescr(rel); YbScanPlan ybc_plan = (YbScanPlan) palloc0(sizeof(YbScanPlanData)); - ybcLoadTableInfo(index ? index : rel, ybc_plan); + ybcLoadTableInfo(useIndex ? index : rel, ybc_plan); foreach(lc, yb_conds) { @@ -282,8 +292,7 @@ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List } else { - AttrNumber attnum; - TupleDesc tupleDesc = index ? RelationGetDescr(index) : RelationGetDescr(rel); + TupleDesc tupleDesc = RelationGetDescr(useIndex ? index : rel); /* * TODO: We scan the range columns by increasing attribute number to look for the first @@ -291,7 +300,7 @@ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List * of the primary key in YugaByte follows the same order as the attribute number. When the * bug is fixed, this scan needs to be updated. 
*/ - for (attnum = 1; attnum <= tupleDesc->natts; attnum++) + for (AttrNumber attnum = 1; attnum <= tupleDesc->natts; attnum++) { int bms_idx = attnum - FirstLowInvalidHeapAttributeNumber; if ( bms_is_member(bms_idx, ybc_plan->primary_key) && @@ -330,7 +339,6 @@ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List } /* Set WHERE clause values (currently only primary key). */ - bool useIndex = (index != NULL); foreach(lc, ybc_plan->yb_hconds) { Expr *expr = (Expr *) lfirst(lc); @@ -391,7 +399,7 @@ YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List return ybc_state; } -HeapTuple ybcFetchNext(YbScanState ybc_state) +HeapTuple ybcFetchNextHeapTuple(YbScanState ybc_state) { HeapTuple tuple = NULL; bool has_data = false; @@ -423,10 +431,68 @@ HeapTuple ybcFetchNext(YbScanState ybc_state) { tuple->t_ybctid = PointerGetDatum(syscols.ybctid); } - if (syscols.ybbasectid != NULL) + } + pfree(values); + pfree(nulls); + + return tuple; +} + +IndexTuple ybcFetchNextIndexTuple(YbScanState ybc_state, Relation index) +{ + IndexTuple tuple = NULL; + bool has_data = false; + TupleDesc tupdesc = ybc_state->tupleDesc; + + Datum *values = (Datum *) palloc0(tupdesc->natts * sizeof(Datum)); + bool *nulls = (bool *) palloc(tupdesc->natts * sizeof(bool)); + YBCPgSysColumns syscols; + + /* Fetch one row. */ + HandleYBStmtStatusWithOwner(YBCPgDmlFetch(ybc_state->handle, + tupdesc->natts, + (uint64_t *) values, + nulls, + &syscols, + &has_data), + ybc_state->handle, + ybc_state->stmt_owner); + + if (has_data) + { + /* + * Return the IndexTuple. If this is a primary key, reorder the values first as expected + * in the index's column order.
+ */ + if (index->rd_index->indisprimary) + { + Assert(index->rd_index->indnatts <= INDEX_MAX_KEYS); + + Datum ivalues[INDEX_MAX_KEYS]; + bool inulls[INDEX_MAX_KEYS]; + + for (int i = 0; i < index->rd_index->indnatts; i++) + { + AttrNumber attno = index->rd_index->indkey.values[i]; + ivalues[i] = values[attno - 1]; + inulls[i] = nulls[attno - 1]; + } + + tuple = index_form_tuple(RelationGetDescr(index), ivalues, inulls); + if (syscols.ybctid != NULL) + { + tuple->t_ybctid = PointerGetDatum(syscols.ybctid); + } + } + else { - tuple->t_ybctid = PointerGetDatum(syscols.ybbasectid); + tuple = index_form_tuple(tupdesc, values, nulls); + if (syscols.ybbasectid != NULL) + { + tuple->t_ybctid = PointerGetDatum(syscols.ybbasectid); + } } + } pfree(values); pfree(nulls); @@ -434,7 +500,7 @@ HeapTuple ybcFetchNext(YbScanState ybc_state) return tuple; } -extern void ybcEndScan(YbScanState ybc_state) +void ybcEndScan(YbScanState ybc_state) { ybcFreeScanState(ybc_state); } diff --git a/src/postgres/src/backend/executor/ybc_fdw.c b/src/postgres/src/backend/executor/ybc_fdw.c index ad0dfc06cb89..7ecdb6a7d1b8 100644 --- a/src/postgres/src/backend/executor/ybc_fdw.c +++ b/src/postgres/src/backend/executor/ybc_fdw.c @@ -73,232 +73,11 @@ static const int DEFAULT_YB_NUM_ROWS = 1000; typedef struct YbFdwPlanState { - /* YugaByte metadata about the referenced table/relation. */ - Bitmapset *primary_key; - Bitmapset *hash_key; - /* Bitmap of attribute (column) numbers that we need to fetch from YB. */ Bitmapset *target_attrs; - /* (Equality) Conditions on hash key -- filtered by YugaByte */ - List *yb_hconds; - - /* (Equality) Conditions on range key -- filtered by YugaByte */ - List *yb_rconds; - - /* Rest of baserestrictinfo conditions -- filtered by Postgres */ - List *pg_conds; - - /* - * The set of columns set by YugaByte conds (i.e. in yb_hconds or yb_rconds - * above). Used to check if hash or primary key is fully set. 
- */ - Bitmapset *yb_cols; - } YbFdwPlanState; -static bool IsSupportedPredicateExpr(Expr *expr) -{ - switch (nodeTag(expr)) - { - case T_Const: - return true; - case T_Param: - { - Param *param = (Param *) expr; - return param->paramkind == PARAM_EXTERN; - } - case T_RelabelType: - { - /* - * RelabelType is a "dummy" type coercion between two binary- - * compatible datatypes so we just recurse into its argument. - */ - RelabelType *rt = (RelabelType *) expr; - return IsSupportedPredicateExpr(rt->arg); - } - default: - break; - } - - return false; -} - -static void GetValFromPredicateExpr(Expr *expr, - ParamListInfo params_info, - Datum *value, - bool *isnull) -{ - switch (nodeTag(expr)) - { - case T_RelabelType: - { - /* - * RelabelType is a "dummy" type coercion between two binary- - * compatible datatypes so we just recurse into its argument. - */ - RelabelType *rt = (RelabelType *) expr; - return GetValFromPredicateExpr(rt->arg, params_info, value, isnull); - } - case T_Const: - { - Const *cst = (Const *) expr; - *value = cst->constvalue; - *isnull = cst->constisnull; - return; - } - case T_Param: - { - int paramid = ((Param *) expr)->paramid; - *value = params_info->params[paramid - 1].value; - *isnull = params_info->params[paramid - 1].isnull; - return; - } - default: - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), errmsg( - "Found unsupported YugaByte RHS expression %s", - nodeToString(expr)))); - } -} - -/* - * Returns whether an expression can be pushed down to be evaluated by YugaByte. - * Otherwise, it will need to be evaluated by Postgres as it filters the rows - * returned by YugaByte. - */ -static void ybcClassifyWhereExpr(RelOptInfo *baserel, - YbFdwPlanState *yb_state, - Expr *expr) -{ - HeapTuple tuple; - Form_pg_operator form; - /* YugaByte only supports base relations (e.g. no joins or child rels) */ - if (baserel->reloptkind == RELOPT_BASEREL) - { - - /* YugaByte only supports operator expressions (e.g. 
no functions) */ - if (IsA(expr, OpExpr)) - { - - /* Get operator info */ - OpExpr *opExpr = (OpExpr *) expr; - tuple = SearchSysCache1(OPEROID, ObjectIdGetDatum(opExpr->opno)); - if (!HeapTupleIsValid(tuple)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), errmsg( - "cache lookup failed for operator %u", - opExpr->opno))); - form = (Form_pg_operator) GETSTRUCT(tuple); - char *opname = NameStr(form->oprname); - bool is_eq = strcmp(opname, "=") == 0; - /* Note: the != operator is converted to <> in the parser stage */ - bool is_ineq = strcmp(opname, ">") == 0 || - strcmp(opname, ">=") == 0 || - strcmp(opname, "<") == 0 || - strcmp(opname, "<=") == 0 || - strcmp(opname, "<>") == 0; - - ReleaseSysCache(tuple); - - /* Currently, YugaByte only supports comparison operators. */ - if (is_eq || is_ineq) - { - /* Supported operators ensure there are exactly two arguments */ - Expr *left = linitial(opExpr->args); - Expr *right = lsecond(opExpr->args); - - /* - * Currently, YugaByte only supports conds of the form ' - * ' or ' ' at this point. - * Note: Postgres should have already evaluated expressions - * with no column refs before this point. - */ - if ((IsA(left, Var) && IsSupportedPredicateExpr(right)) || - (IsSupportedPredicateExpr(left) && IsA(right, Var))) - { - AttrNumber attrNum; - attrNum = IsA(left, Var) ? ((Var *) left)->varattno - : ((Var *) right)->varattno; - - int bms_idx = attrNum - baserel->min_attr + 1; - bool is_primary = bms_is_member(bms_idx, - yb_state->primary_key); - bool is_hash = bms_is_member(bms_idx, - yb_state->hash_key); - - /* - * TODO Once we support WHERE clause in pggate, these - * conditions need to be updated accordingly. 
- */ - if (is_hash && is_eq) - { - yb_state->yb_cols = bms_add_member(yb_state->yb_cols, - bms_idx); - yb_state->yb_hconds = lappend(yb_state->yb_hconds, - expr); - return; - } - else if (is_primary && is_eq) - { - yb_state->yb_cols = bms_add_member(yb_state->yb_cols, - bms_idx); - yb_state->yb_rconds = lappend(yb_state->yb_rconds, - expr); - return; - } - } - } - } - } - - /* Otherwise let postgres handle the condition (default) */ - yb_state->pg_conds = lappend(yb_state->pg_conds, expr); -} - -/* - * Add a Postgres expression as a where condition to a YugaByte select - * statement. Assumes the expression can be evaluated by YugaByte - * (i.e. ybcIsYbExpression returns true). - */ -static void ybcAddWhereCond(EState *estate, Expr* expr, YBCPgStatement yb_stmt) -{ - OpExpr *opExpr = (OpExpr *) expr; - - /* - * ybcClassifyWhereExpr should only pass conditions to YugaByte if the - * assertion below holds. - */ - Assert(opExpr->args->length == 2); - Expr *left = linitial(opExpr->args); - Expr *right = lsecond(opExpr->args); - Assert((IsA(left, Var) && IsSupportedPredicateExpr(right)) || - (IsSupportedPredicateExpr(left) && IsA(right, Var))); - - Var *col_desc; - Expr *rhs_expr; - - if (IsA(left, Var)) - { - col_desc = (Var *) left; - rhs_expr = right; - } - else - { - col_desc = (Var *) right; - rhs_expr = left; - } - - Datum value; - bool is_null; - GetValFromPredicateExpr(rhs_expr, estate->es_param_list_info, &value, &is_null); - YBCPgExpr ybc_expr = YBCNewConstant(yb_stmt, - col_desc->vartype, - value, - is_null); - HandleYBStatus(YBCPgDmlBindColumn(yb_stmt, col_desc->varattno, ybc_expr)); -} - /* * ybcGetForeignRelSize * Obtain relation size estimates for a foreign table @@ -308,61 +87,10 @@ ybcGetForeignRelSize(PlannerInfo *root, RelOptInfo *baserel, Oid foreigntableid) { - Oid relid; - Relation rel = NULL; - ListCell *cell = NULL; YbFdwPlanState *ybc_plan = NULL; ybc_plan = (YbFdwPlanState *) palloc0(sizeof(YbFdwPlanState)); - relid = 
root->simple_rte_array[baserel->relid]->relid; - - /* - * Get table info (from both Postgres and YugaByte). - * YugaByte info is currently mainly primary and partition (hash) keys. - */ - rel = RelationIdGetRelation(relid); - YBCPgTableDesc ybc_table_desc = NULL; - HandleYBStatus(YBCPgGetTableDesc(ybc_pg_session, - YBCGetDatabaseOid(rel), - relid, - &ybc_table_desc)); - - for (AttrNumber col = baserel->min_attr; col <= baserel->max_attr; col++) - { - bool is_primary = false; - bool is_hash = false; - HandleYBTableDescStatus(YBCPgGetColumnInfo(ybc_table_desc, - col, - &is_primary, - &is_hash), ybc_table_desc); - int bms_idx = col - baserel->min_attr + 1; - if (is_hash) - { - ybc_plan->hash_key = bms_add_member(ybc_plan->hash_key, bms_idx); - } - if (is_primary) - { - ybc_plan->primary_key = bms_add_member(ybc_plan->primary_key, - bms_idx); - } - } - HandleYBStatus(YBCPgDeleteTableDesc(ybc_table_desc)); - ybc_table_desc = NULL; - RelationClose(rel); - rel = NULL; - - /* - * Split scan_clauses between those handled by YugaByte and the rest (which - * should be checked by Postgres). - * Ignore pseudoconstants (which will be handled elsewhere). - */ - foreach(cell, baserel->baserestrictinfo) - { - RestrictInfo *ri = (RestrictInfo *) lfirst(cell); - ybcClassifyWhereExpr(baserel, ybc_plan, ri->clause); - } - /* Save the output-rows estimate for the planner */ baserel->rows = DEFAULT_YB_NUM_ROWS; baserel->fdw_private = ybc_plan; @@ -397,20 +125,19 @@ ybcGetForeignPaths(PlannerInfo *root, cpu_per_tuple * baserel->rows; /* Create a ForeignPath node and it as the scan path */ - /* TODO Can add YB order guarantees to pathkeys (if hash key is fixed). 
*/ add_path(baserel, (Path *) create_foreignscan_path(root, baserel, - NULL, /* default pathtarget */ + NULL, /* default pathtarget */ baserel->rows, startup_cost, total_cost, - NIL, /* no pathkeys */ - NULL, /* no outer rel either */ - NULL, /* no extra plan */ - NULL /* no options yet */ )); + NIL, /* no pathkeys */ + NULL, /* no outer rel either */ + NULL, /* no extra plan */ + NULL /* no options yet */ )); - /* Add secondary index paths also */ + /* Add primary key and secondary index paths also */ create_index_paths(root, baserel); } @@ -429,53 +156,11 @@ ybcGetForeignPlan(PlannerInfo *root, { YbFdwPlanState *yb_plan_state = (YbFdwPlanState *) baserel->fdw_private; Index scan_relid = baserel->relid; - List *fdw_private; ListCell *lc; - /* - * Split any unprocessed scan_clauses (i.e. joins restrictions if any) - * between those handled by YugaByte and the rest (which should be - * checked by Postgres). - * Ignore pseudoconstants (which will be handled elsewhere). - */ scan_clauses = extract_actual_clauses(scan_clauses, false); - foreach(lc, scan_clauses) - { - Expr *expr = (Expr *) lfirst(lc); - if (!list_member_ptr(yb_plan_state->yb_hconds, expr) && - !list_member_ptr(yb_plan_state->yb_rconds, expr) && - !list_member_ptr(yb_plan_state->pg_conds, expr)) - { - ybcClassifyWhereExpr(baserel, yb_plan_state, expr); - } - } - - /* - * If hash key is not fully set, we must do a full-table scan in YugaByte - * and defer all filtering to Postgres. - * Else, if primary key is not fully set we need to remove all range - * key conds and defer filtering for range column conds to Postgres. 
- */ - if (!bms_is_subset(yb_plan_state->hash_key, yb_plan_state->yb_cols)) - { - yb_plan_state->pg_conds = scan_clauses; - yb_plan_state->yb_hconds = NIL; - yb_plan_state->yb_rconds = NIL; - } - else if (!bms_is_subset(yb_plan_state->primary_key, yb_plan_state->yb_cols)) - { - yb_plan_state->pg_conds = list_concat(yb_plan_state->pg_conds, - yb_plan_state->yb_rconds); - yb_plan_state->yb_rconds = NIL; - } - - /* - * Get the target columns that need to be retrieved from YugaByte. - * Specifically, any columns that are either: - * 1. Referenced in the select targets (i.e. selected columns or exprs). - * 2. Referenced in the WHERE clause exprs that Postgres must evaluate. - */ + /* Get the target columns that need to be retrieved from YugaByte */ foreach(lc, baserel->reltarget->exprs) { Expr *expr = (Expr *) lfirst(lc); @@ -485,7 +170,7 @@ ybcGetForeignPlan(PlannerInfo *root, baserel->min_attr); } - foreach(lc, yb_plan_state->pg_conds) + foreach(lc, scan_clauses) { Expr *expr = (Expr *) lfirst(lc); pull_varattnos_min_attr((Node *) expr, @@ -536,17 +221,14 @@ ybcGetForeignPlan(PlannerInfo *root, } } - List *yb_conds = list_concat(yb_plan_state->yb_hconds, yb_plan_state->yb_rconds); - /* Create the ForeignScan node */ - fdw_private = list_make2(target_attrs, yb_conds); return make_foreignscan(tlist, /* target list */ - yb_plan_state->pg_conds, /* checked by Postgres */ + scan_clauses, scan_relid, NIL, /* expressions YB may evaluate (none) */ - fdw_private, /* private data for YB */ - NIL, /* custom YB target list (none for now */ - yb_conds, /* checked by YB */ + target_attrs, /* fdw_private data for YB */ + NIL, /* custom YB target list (none for now) */ + NIL, /* conditions checked by YB (none) */ outer_plan); } @@ -559,8 +241,8 @@ ybcGetForeignPlan(PlannerInfo *root, typedef struct YbFdwExecState { /* The handle for the internal YB Select statement.
*/ - YBCPgStatement handle; - ResourceOwner stmt_owner; + YBCPgStatement handle; + ResourceOwner stmt_owner; } YbFdwExecState; /* @@ -575,10 +257,8 @@ ybcBeginForeignScan(ForeignScanState *node, int eflags) Relation relation = node->ss.ss_currentRelation; TupleDesc tupdesc = RelationGetDescr(relation); - /* Planning function above should ensure both target and conds are set */ - Assert(foreignScan->fdw_private->length == 2); - List *target_attrs = linitial(foreignScan->fdw_private); - List *yb_conds = lsecond(foreignScan->fdw_private); + /* Planning function above should ensure target list is set */ + List *target_attrs = foreignScan->fdw_private; YbFdwExecState *ybc_state = NULL; ListCell *lc; @@ -601,13 +281,6 @@ ybcBeginForeignScan(ForeignScanState *node, int eflags) ResourceOwnerRememberYugaByteStmt(CurrentResourceOwner, ybc_state->handle); ybc_state->stmt_owner = CurrentResourceOwner; - /* Set WHERE clause values (currently only primary key). */ - foreach(lc, yb_conds) - { - Expr *expr = (Expr *) lfirst(lc); - ybcAddWhereCond(estate, expr, ybc_state->handle); - } - /* Set scan targets. */ bool has_targets = false; foreach(lc, target_attrs) diff --git a/src/postgres/src/backend/parser/parse_utilcmd.c b/src/postgres/src/backend/parser/parse_utilcmd.c index cdfeb82fb4ce..742d15536cb8 100644 --- a/src/postgres/src/backend/parser/parse_utilcmd.c +++ b/src/postgres/src/backend/parser/parse_utilcmd.c @@ -331,8 +331,7 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) * Postprocess constraints that give rise to index definitions. * In YugaByte mode we handle ixconstraints as regular constraints below. */ - if (!IsYugaByteEnabled()) - transformIndexConstraints(&cxt); + transformIndexConstraints(&cxt); /* * Postprocess foreign-key constraints. 
@@ -362,7 +361,6 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) if (IsYugaByteEnabled()) { stmt->constraints = list_concat(stmt->constraints, cxt.ixconstraints); - cxt.ixconstraints = NIL; } result = lappend(cxt.blist, stmt); diff --git a/src/postgres/src/backend/utils/cache/syscache.c b/src/postgres/src/backend/utils/cache/syscache.c index 9acf957d316f..e5d948a52445 100644 --- a/src/postgres/src/backend/utils/cache/syscache.c +++ b/src/postgres/src/backend/utils/cache/syscache.c @@ -990,65 +990,6 @@ static int SysCacheSupportingRelOidSize; static int oid_compare(const void *a, const void *b); -/* - * Returns the oid of the primary key index of the sys catalog tables in SysCache in YugaByte. - */ -Oid -YBSysTablePrimaryKeyOid(Oid relid) -{ - switch (relid) - { - case AccessMethodOperatorRelationId: return AccessMethodOperatorOidIndexId; - case AccessMethodProcedureRelationId: return AccessMethodProcedureOidIndexId; - case AccessMethodRelationId: return AmOidIndexId; - case AggregateRelationId: return AggregateFnoidIndexId; - case AttributeRelationId: return AttributeRelidNumIndexId; - case AuthIdRelationId: return AuthIdOidIndexId; - case AuthMemRelationId: return AuthMemRoleMemIndexId; - case CastRelationId: return CastOidIndexId; - case CollationRelationId: return CollationOidIndexId; - case ConstraintRelationId: return ConstraintOidIndexId; - case ConversionRelationId: return ConversionOidIndexId; - case DatabaseRelationId: return DatabaseOidIndexId; - case DefaultAclRelationId: return DefaultAclOidIndexId; - case EnumRelationId: return EnumOidIndexId; - case EventTriggerRelationId: return EventTriggerOidIndexId; - case ForeignDataWrapperRelationId: return ForeignDataWrapperOidIndexId; - case ForeignServerRelationId: return ForeignServerOidIndexId; - case ForeignTableRelationId: return ForeignTableRelidIndexId; - case IndexRelationId: return IndexRelidIndexId; - case LanguageRelationId: return LanguageOidIndexId; - case NamespaceRelationId: 
return NamespaceOidIndexId; - case OperatorClassRelationId: return OpclassOidIndexId; - case OperatorFamilyRelationId: return OpfamilyOidIndexId; - case OperatorRelationId: return OperatorOidIndexId; - case PartitionedRelationId: return PartitionedRelidIndexId; - case ProcedureRelationId: return ProcedureOidIndexId; - case PublicationRelRelationId: return PublicationRelObjectIndexId; - case PublicationRelationId: return PublicationObjectIndexId; - case RangeRelationId: return RangeTypidIndexId; - case RelationRelationId: return ClassOidIndexId; - case ReplicationOriginRelationId: return ReplicationOriginIdentIndex; - case RewriteRelationId: return RewriteOidIndexId; - case SequenceRelationId: return SequenceRelidIndexId; - case StatisticExtRelationId: return StatisticExtOidIndexId; - case StatisticRelationId: return StatisticRelidAttnumInhIndexId; - case SubscriptionRelRelationId: return SubscriptionRelSrrelidSrsubidIndexId; - case SubscriptionRelationId: return SubscriptionObjectIndexId; - case TSConfigMapRelationId: return TSConfigMapIndexId; - case TSConfigRelationId: return TSConfigOidIndexId; - case TSDictionaryRelationId: return TSDictionaryOidIndexId; - case TSParserRelationId: return TSParserOidIndexId; - case TSTemplateRelationId: return TSTemplateOidIndexId; - case TableSpaceRelationId: return TablespaceOidIndexId; - case TransformRelationId: return TransformOidIndexId; - case TypeRelationId: return TypeOidIndexId; - case UserMappingRelationId: return UserMappingOidIndexId; - default: break; - } - return InvalidOid; -} - Bitmapset * YBSysTablePrimaryKey(Oid relid) { diff --git a/src/postgres/src/include/access/itup.h b/src/postgres/src/include/access/itup.h index bd3a70238095..0b0fe1405e56 100644 --- a/src/postgres/src/include/access/itup.h +++ b/src/postgres/src/include/access/itup.h @@ -35,6 +35,7 @@ typedef struct IndexTupleData { ItemPointerData t_tid; /* reference TID to heap tuple */ + Datum t_ybctid; /* virtual column ybctid */ /* --------------- * 
t_info is laid out in the following fashion: diff --git a/src/postgres/src/include/access/relscan.h b/src/postgres/src/include/access/relscan.h index 61e2bea1291d..562ad5bfc3b2 100644 --- a/src/postgres/src/include/access/relscan.h +++ b/src/postgres/src/include/access/relscan.h @@ -78,7 +78,7 @@ typedef struct HeapScanDescData int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ - YbSysScanDesc ybscan; /* only valid in yb-scan case */ + YbScanDesc ybscan; /* only valid in yb-scan case */ } HeapScanDescData; /* @@ -160,7 +160,7 @@ typedef struct SysScanDescData HeapScanDesc scan; /* only valid in heap-scan case */ IndexScanDesc iscan; /* only valid in index-scan case */ Snapshot snapshot; /* snapshot to unregister at end of scan */ - YbSysScanDesc ybscan; /* only valid in yb-scan case */ + YbScanDesc ybscan; /* only valid in yb-scan case */ } SysScanDescData; #endif /* RELSCAN_H */ diff --git a/src/postgres/src/include/access/ybcam.h b/src/postgres/src/include/access/ybcam.h index 71b6e750bf26..f93a80453dd2 100644 --- a/src/postgres/src/include/access/ybcam.h +++ b/src/postgres/src/include/access/ybcam.h @@ -37,14 +37,16 @@ #include "executor/ybcExpr.h" #include "executor/ybcScan.h" -typedef struct YbSysScanDescData +typedef struct YbScanDescData { YbScanState state; int nkeys; ScanKey key; -} YbSysScanDescData; + AttrNumber sk_attno[INDEX_MAX_KEYS * 2]; + Relation index; +} YbScanDescData; -typedef struct YbSysScanDescData *YbSysScanDesc; +typedef struct YbScanDescData *YbScanDesc; /* * Access to YB-stored system catalogs (mirroring API from genam.c) @@ -52,11 +54,11 @@ typedef struct YbSysScanDescData *YbSysScanDesc; * would do either heap scan or index scan depending on the params). 
*/ extern SysScanDesc ybc_systable_beginscan(Relation relation, - Oid indexId, - bool indexOK, - Snapshot snapshot, - int nkeys, - ScanKey key); + Oid indexId, + bool indexOK, + Snapshot snapshot, + int nkeys, + ScanKey key); extern HeapTuple ybc_systable_getnext(SysScanDesc scanDesc); extern void ybc_systable_endscan(SysScanDesc scan_desc); @@ -75,12 +77,21 @@ extern void ybc_heap_endscan(HeapScanDesc scanDesc); /* * Access to YB-stored index (mirroring API from indexam.c) * We will do a YugaByte scan instead of a heap scan. + * When the index is the primary key, the base table is scanned instead. */ -extern void ybc_index_beginscan(Relation relation, +extern void ybc_pkey_beginscan(Relation relation, + Relation index, + IndexScanDesc scan_desc, + int nkeys, + ScanKey key); +extern HeapTuple ybc_pkey_getnext(IndexScanDesc scan_desc); +extern void ybc_pkey_endscan(IndexScanDesc scan_desc); + +extern void ybc_index_beginscan(Relation index, IndexScanDesc scan_desc, int nkeys, ScanKey key); -extern HeapTuple ybc_index_getnext(IndexScanDesc scan_desc); +extern IndexTuple ybc_index_getnext(IndexScanDesc scan_desc); extern void ybc_index_endscan(IndexScanDesc scan_desc); /* diff --git a/src/postgres/src/include/commands/ybccmds.h b/src/postgres/src/include/commands/ybccmds.h index 7d5dbdd0ed96..7e34fa571892 100644 --- a/src/postgres/src/include/commands/ybccmds.h +++ b/src/postgres/src/include/commands/ybccmds.h @@ -43,12 +43,15 @@ extern void YBCReserveOids(Oid dboid, Oid next_oid, uint32 count, Oid *begin_oid /* Table Functions ----------------------------------------------------------------------------- */ -extern void YBCCreateTable(CreateStmt *stmt, char relkind, TupleDesc desc, Oid relationId, - Oid namespaceId); +extern void YBCCreateTable(CreateStmt *stmt, + char relkind, + TupleDesc desc, + Oid relationId, + Oid namespaceId); extern void YBCDropTable(Oid relationId); -extern void YBCTruncateTable(Relation rel); +extern void YBCTruncateTable(Relation rel); 
extern void YBCCreateIndex(const char *indexName, IndexInfo *indexInfo, @@ -62,5 +65,4 @@ extern void YBCAlterTable(AlterTableStmt* stmt, Relation rel, Oid relationId); extern void YBCRename(RenameStmt* stmt, Oid relationId); - #endif diff --git a/src/postgres/src/include/executor/ybcScan.h b/src/postgres/src/include/executor/ybcScan.h index 9632cef0ebd7..6f13ff33d171 100644 --- a/src/postgres/src/include/executor/ybcScan.h +++ b/src/postgres/src/include/executor/ybcScan.h @@ -25,6 +25,7 @@ #include "postgres.h" +#include "access/itup.h" #include "utils/resowner.h" #include "yb/yql/pggate/ybc_pggate.h" @@ -71,12 +72,12 @@ typedef struct YbScanStateData typedef YbScanStateData *YbScanState; -void ybcFreeScanState(YbScanState ybc_state); extern YbScanState ybcBeginScan(Relation rel, Relation index, List *target_attrs, List *yb_conds); -extern HeapTuple ybcFetchNext(YbScanState ybc_state); +extern HeapTuple ybcFetchNextHeapTuple(YbScanState ybc_state); +extern IndexTuple ybcFetchNextIndexTuple(YbScanState ybc_state, Relation index); extern void ybcEndScan(YbScanState ybc_handle); diff --git a/src/postgres/src/include/pg_yb_utils.h b/src/postgres/src/include/pg_yb_utils.h index ad34e7e8b9db..42636acc0a4a 100644 --- a/src/postgres/src/include/pg_yb_utils.h +++ b/src/postgres/src/include/pg_yb_utils.h @@ -57,7 +57,7 @@ extern YBCPgSession ybc_pg_session; */ extern uint64 ybc_catalog_cache_version; -/** +/* * Checks whether YugaByte functionality is enabled within PostgreSQL. * This relies on ybc_pg_session being non-NULL, so probably should not be used * in postmaster (which does not need to talk to YB backend) or early @@ -84,7 +84,7 @@ extern AttrNumber YBGetFirstLowInvalidAttributeNumber(Relation relation); extern AttrNumber YBGetFirstLowInvalidAttributeNumberFromOid(Oid relid); -/** +/* * Whether to route BEGIN / COMMIT / ROLLBACK to YugaByte's distributed * transactions. 
*/ @@ -119,10 +119,9 @@ extern void HandleYBTableDescStatus(YBCStatus status, YBCPgTableDesc table); * YB initialization that needs to happen when a PostgreSQL backend process * is started. Reports errors using ereport. */ -extern void YBInitPostgresBackend( - const char *program_name, - const char *db_name, - const char *user_name); +extern void YBInitPostgresBackend(const char *program_name, + const char *db_name, + const char *user_name); /* * This should be called on all exit paths from the PostgreSQL backend process. @@ -130,14 +129,14 @@ extern void YBInitPostgresBackend( */ extern void YBOnPostgresBackendShutdown(); -/** +/* * Commits the current YugaByte-level transaction. Returns true in case of * successful commit and false in case of failure. If there is no transaction in * progress, also returns true. */ extern bool YBCCommitTransaction(); -/** +/* * Handle a commit error if it happened during a previous call to * YBCCommitTransaction. We allow deferring this handling in order to be able * to make PostgreSQL transaction block state transitions before calling @@ -145,24 +144,24 @@ extern bool YBCCommitTransaction(); */ extern void YBCHandleCommitError(); -/** +/* * Return true if we want to allow PostgreSQL's own locking. This is needed * while system tables are still managed by PostgreSQL. */ extern bool YBIsPgLockingEnabled(); -/** +/* * Return a string representation of the given type id, or say it is unknown. * What is returned is always a static C string constant. */ extern const char* YBPgTypeOidToStr(Oid type_id); -/** +/* * Report an error saying the given type as not supported by YugaByte. */ extern void YBReportTypeNotSupported(Oid type_id); -/** +/* * Log whether or not YugaByte is enabled. 
*/ extern void YBReportIfYugaByteEnabled(); @@ -175,7 +174,7 @@ extern void YBReportIfYugaByteEnabled(); computed_type_id, YBPgTypeOidToStr(computed_type_id)))); \ } while (0) -/** +/* * Determines if PostgreSQL should restart all child processes if one of them * crashes. This behavior usually shows up in the log like so: * @@ -193,7 +192,7 @@ extern void YBReportIfYugaByteEnabled(); */ bool YBShouldRestartAllChildrenIfOneCrashes(); -/** +/* * Define additional inline wrappers around _Status functions that return the * real return value and ereport the error status. */ @@ -207,29 +206,29 @@ bool YBShouldRestartAllChildrenIfOneCrashes(); void YBSetPreparingTemplates(); bool YBIsPreparingTemplates(); -/** +/* * Whether every ereport of the ERROR level and higher should log a stack trace. */ bool YBShouldLogStackTraceOnError(); -/** +/* * Converts the PostgreSQL error level as listed in elog.h to a string. Always * returns a static const char string. */ const char* YBPgErrorLevelToString(int elevel); -/** +/* * Get the database name for a relation id (accounts for system databases and * shared relations) */ const char* YBCGetDatabaseName(Oid relid); -/** +/* * Get the schema name for a schema oid (accounts for system namespaces) */ const char* YBCGetSchemaName(Oid schemaoid); -/** +/* * Get the real database id of a relation. For shared relations, it will be * template1. 
*/ diff --git a/src/postgres/src/include/utils/syscache.h b/src/postgres/src/include/utils/syscache.h index 7429d313b1dc..cd3b7ad1e451 100644 --- a/src/postgres/src/include/utils/syscache.h +++ b/src/postgres/src/include/utils/syscache.h @@ -114,7 +114,6 @@ enum SysCacheIdentifier #define SysCacheSize (USERMAPPINGUSERSERVER + 1) }; -extern Oid YBSysTablePrimaryKeyOid(Oid relid); extern Bitmapset *YBSysTablePrimaryKey(Oid relid); /* Used in IsYugaByteEnabled() mode only */ diff --git a/src/postgres/src/test/regress/expected/yb_create_index.out b/src/postgres/src/test/regress/expected/yb_create_index.out index 2b65d832310e..be79af548267 100644 --- a/src/postgres/src/test/regress/expected/yb_create_index.out +++ b/src/postgres/src/test/regress/expected/yb_create_index.out @@ -73,38 +73,45 @@ SELECT * FROM test_index ORDER BY v1; 5 | 15 | 25 (3 rows) --- These search options are not supported in an index yet. +-- Verify different WHERE conditions are supported. SELECT * FROM test_index WHERE v1 IS NULL; -ERROR: WHERE condition option 65 not supported yet -DETAIL: The WHERE condition option is not supported yet. -HINT: Rewrite the condition differently. + v1 | v2 | v3 +----+----+---- +(0 rows) + SELECT * FROM test_index WHERE v1 IS NOT NULL; -ERROR: WHERE condition option 129 not supported yet -DETAIL: The WHERE condition option is not supported yet. -HINT: Rewrite the condition differently. + v1 | v2 | v3 +----+----+---- + 5 | 15 | 25 + 1 | 11 | 21 + 4 | 14 | 24 +(3 rows) + SELECT * FROM test_index WHERE v1 IN (1, 2, 3); -ERROR: WHERE condition option 32 not supported yet -DETAIL: The WHERE condition option is not supported yet. -HINT: Rewrite the condition differently. 
+ v1 | v2 | v3 +----+----+---- + 1 | 11 | 21 +(1 row) + -- Verify indexes on system catalog tables are updated properly CREATE TABLE test_sys_catalog_update (k int primary key, v int); -EXPLAIN SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update'; - QUERY PLAN ------------------------------------------------------------------------------------------------ - Index Scan using pg_class_relname_nsp_index on pg_class (cost=0.00..4.01 rows=1000 width=68) +EXPLAIN SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Index Only Scan using pg_class_relname_nsp_index on pg_class (cost=0.00..4.01 rows=1000 width=64) Index Cond: (relname = 'test_sys_catalog_update'::name) (2 rows) -SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update'; - relname | reltype --------------------------+--------- - test_sys_catalog_update | 26447 +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; + relname +------------------------- + test_sys_catalog_update (1 row) EXPLAIN SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update'; - QUERY PLAN ---------------------------------------------------------------------------------------------- - Index Scan using pg_type_typname_nsp_index on pg_type (cost=0.00..4.01 rows=1000 width=64) + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Index Only Scan using pg_type_typname_nsp_index on pg_type (cost=0.00..4.01 rows=1000 width=64) Index Cond: (typname = 'test_sys_catalog_update'::name) (2 rows) @@ -129,9 +136,9 @@ SELECT attname, atttypid FROM pg_attribute WHERE attname = 'v'; ALTER TABLE test_sys_catalog_update RENAME TO test_sys_catalog_update_new; ALTER TABLE test_sys_catalog_update_new RENAME COLUMN v TO w; -SELECT relname, reltype FROM pg_class WHERE relname = 
'test_sys_catalog_update'; - relname | reltype ----------+--------- +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; + relname +--------- (0 rows) SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update'; @@ -144,10 +151,10 @@ SELECT attname, atttypid FROM pg_attribute WHERE attname = 'v'; ---------+---------- (0 rows) -SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update_new'; - relname | reltype ------------------------------+--------- - test_sys_catalog_update_new | 26447 +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update_new'; + relname +----------------------------- + test_sys_catalog_update_new (1 row) SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update_new'; @@ -161,3 +168,140 @@ SELECT attname, atttypid FROM pg_attribute WHERE attname = 'w'; ---------+---------- w | 23 (1 row) + +-- Test primary key as index +CREATE TABLE t1 (h INT, r INT, v1 INT, v2 INT, PRIMARY KEY (h, r)); +CREATE INDEX ON t1 (v1); +CREATE UNIQUE INDEX ON t1 (v1, v2); +CREATE TABLE t2 (h INT, r INT, v1 INT, v2 INT, PRIMARY KEY (h, r)); +\d t1 + Table "public.t1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + h | integer | | not null | + r | integer | | not null | + v1 | integer | | | + v2 | integer | | | +Indexes: + "t1_pkey" PRIMARY KEY, btree (h, r) + "t1_v1_v2_idx" UNIQUE, btree (v1, v2) + "t1_v1_idx" btree (v1) + +\d t2 + Table "public.t2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + h | integer | | not null | + r | integer | | not null | + v1 | integer | | | + v2 | integer | | | +Indexes: + "t2_pkey" PRIMARY KEY, btree (h, r) + +INSERT INTO t1 VALUES (1, 1, 11, 11), (1, 2, 11, 12); +INSERT INTO t2 VALUES (1, 1, 21, 21); +-- The following 2 inserts should produce error due to duplicate primary key / unique index value +INSERT INTO t1 VALUES (1, 1, 99, 99); +ERROR: Query 
error: Duplicate key found in primary key or unique index +INSERT INTO t1 VALUES (1, 3, 11, 11); +ERROR: Query error: Duplicate key found in primary key or unique index +INSERT INTO t1 VALUES (1, 3, 11, 13), (2, 1, 12, 13), (2, 2, 12, 14); +EXPLAIN (COSTS OFF) SELECT * FROM t1 ORDER BY h, r; + QUERY PLAN +-------------------------- + Sort + Sort Key: h, r + -> Foreign Scan on t1 +(3 rows) + +SELECT * FROM t1 ORDER BY h, r; + h | r | v1 | v2 +---+---+----+---- + 1 | 1 | 11 | 11 + 1 | 2 | 11 | 12 + 1 | 3 | 11 | 13 + 2 | 1 | 12 | 13 + 2 | 2 | 12 | 14 +(5 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h = 1 ORDER BY r; + QUERY PLAN +-------------------------------------- + Sort + Sort Key: r + -> Index Scan using t1_pkey on t1 + Index Cond: (h = 1) +(4 rows) + +SELECT * FROM t1 WHERE h = 1 ORDER BY r; + h | r | v1 | v2 +---+---+----+---- + 1 | 1 | 11 | 11 + 1 | 2 | 11 | 12 + 1 | 3 | 11 | 13 +(3 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h > 1 ORDER BY h, r; + QUERY PLAN +-------------------------------------- + Sort + Sort Key: h, r + -> Index Scan using t1_pkey on t1 + Index Cond: (h > 1) +(4 rows) + +SELECT * FROM t1 WHERE h > 1 ORDER BY h, r; + h | r | v1 | v2 +---+---+----+---- + 2 | 1 | 12 | 13 + 2 | 2 | 12 | 14 +(2 rows) + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h = 1 AND r = 1; + QUERY PLAN +------------------------------------- + Index Scan using t1_pkey on t1 + Index Cond: ((h = 1) AND (r = 1)) +(2 rows) + +SELECT * FROM t1 WHERE h = 1 AND r = 1; + h | r | v1 | v2 +---+---+----+---- + 1 | 1 | 11 | 11 +(1 row) + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE v1 = 11 ORDER BY h, r; + QUERY PLAN +------------------------------------------- + Sort + Sort Key: h, r + -> Index Scan using t1_v1_v2_idx on t1 + Index Cond: (v1 = 11) +(4 rows) + +SELECT * FROM t1 WHERE v1 = 11 ORDER BY h, r; + h | r | v1 | v2 +---+---+----+---- + 1 | 1 | 11 | 11 + 1 | 2 | 11 | 12 + 1 | 3 | 11 | 13 +(3 rows) + +-- Disabled this test because we do not have proper stats. 
We return the same cost estimate +-- for indexes t1_v1_idx and t1_v1_v2_idx and Postgres will be either of them at random. +-- EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE v1 = 11 AND v2 = 11; +-- SELECT * FROM t1 WHERE v1 = 11 AND v2 = 11; +EXPLAIN (COSTS OFF) SELECT t1.h, t1.r, t1.v1, t2.v1 FROM t1, t2 WHERE t1.h = t2.h AND t1.r = t2.r; + QUERY PLAN +------------------------------------------------- + Nested Loop + -> Foreign Scan on t1 + -> Index Scan using t2_pkey on t2 + Index Cond: ((h = t1.h) AND (r = t1.r)) +(4 rows) + +SELECT t1.h, t1.r, t1.v1, t2.v1 FROM t1, t2 WHERE t1.h = t2.h AND t1.r = t2.r; + h | r | v1 | v1 +---+---+----+---- + 1 | 1 | 11 | 21 +(1 row) diff --git a/src/postgres/src/test/regress/expected/yb_feature_select.out b/src/postgres/src/test/regress/expected/yb_feature_select.out index 85c6147718f4..d19588101d1e 100644 --- a/src/postgres/src/test/regress/expected/yb_feature_select.out +++ b/src/postgres/src/test/regress/expected/yb_feature_select.out @@ -209,7 +209,8 @@ SELECT t1.col_array_text, t2.col_name FROM feature_tab_dml t1, feature_tab_dml_identifier t2 - WHERE t1.col_smallint = t2.col_id AND t2.col_name = t1.col_array_text[2]; + WHERE t1.col_smallint = t2.col_id AND t2.col_name = t1.col_array_text[2] + ORDER BY t1.col_smallint; col_smallint | col_array_text | col_name --------------+---------------------+---------- 1 | {one,one,one} | one @@ -230,7 +231,8 @@ SELECT t2.col_name FROM feature_tab_dml t1, feature_tab_dml_identifier t2 WHERE t1.col_smallint = t2.col_id AND - (t2.col_name = 'nine' OR t2.col_name = 'seven'); + (t2.col_name = 'nine' OR t2.col_name = 'seven') + ORDER BY t1.col_smallint; col_smallint | col_array_text | col_name --------------+---------------------+---------- 7 | {seven,seven,seven} | seven diff --git a/src/postgres/src/test/regress/sql/yb_create_index.sql b/src/postgres/src/test/regress/sql/yb_create_index.sql index a69cdc1d4f2f..3b77dbf95d66 100644 --- a/src/postgres/src/test/regress/sql/yb_create_index.sql +++ 
b/src/postgres/src/test/regress/sql/yb_create_index.sql @@ -72,17 +72,18 @@ SELECT * FROM test_index ORDER BY v1; DELETE FROM test_index WHERE v2 = 12 OR v2 = 13; SELECT * FROM test_index ORDER BY v1; --- These search options are not supported in an index yet. +-- Verify different WHERE conditions are supported. SELECT * FROM test_index WHERE v1 IS NULL; SELECT * FROM test_index WHERE v1 IS NOT NULL; SELECT * FROM test_index WHERE v1 IN (1, 2, 3); + -- Verify indexes on system catalog tables are updated properly CREATE TABLE test_sys_catalog_update (k int primary key, v int); -EXPLAIN SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update'; -SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update'; +EXPLAIN SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; EXPLAIN SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update'; SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update'; @@ -93,10 +94,51 @@ SELECT attname, atttypid FROM pg_attribute WHERE attname = 'v'; ALTER TABLE test_sys_catalog_update RENAME TO test_sys_catalog_update_new; ALTER TABLE test_sys_catalog_update_new RENAME COLUMN v TO w; -SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update'; +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update'; SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update'; SELECT attname, atttypid FROM pg_attribute WHERE attname = 'v'; -SELECT relname, reltype FROM pg_class WHERE relname = 'test_sys_catalog_update_new'; +SELECT relname FROM pg_class WHERE relname = 'test_sys_catalog_update_new'; SELECT typname FROM pg_type WHERE typname = 'test_sys_catalog_update_new'; SELECT attname, atttypid FROM pg_attribute WHERE attname = 'w'; + +-- Test primary key as index +CREATE TABLE t1 (h INT, r INT, v1 INT, v2 INT, PRIMARY KEY (h, r)); +CREATE INDEX ON t1 (v1); 
+CREATE UNIQUE INDEX ON t1 (v1, v2); +CREATE TABLE t2 (h INT, r INT, v1 INT, v2 INT, PRIMARY KEY (h, r)); + +\d t1 +\d t2 + +INSERT INTO t1 VALUES (1, 1, 11, 11), (1, 2, 11, 12); +INSERT INTO t2 VALUES (1, 1, 21, 21); + +-- The following 2 inserts should produce error due to duplicate primary key / unique index value +INSERT INTO t1 VALUES (1, 1, 99, 99); +INSERT INTO t1 VALUES (1, 3, 11, 11); + +INSERT INTO t1 VALUES (1, 3, 11, 13), (2, 1, 12, 13), (2, 2, 12, 14); + +EXPLAIN (COSTS OFF) SELECT * FROM t1 ORDER BY h, r; +SELECT * FROM t1 ORDER BY h, r; + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h = 1 ORDER BY r; +SELECT * FROM t1 WHERE h = 1 ORDER BY r; + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h > 1 ORDER BY h, r; +SELECT * FROM t1 WHERE h > 1 ORDER BY h, r; + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE h = 1 AND r = 1; +SELECT * FROM t1 WHERE h = 1 AND r = 1; + +EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE v1 = 11 ORDER BY h, r; +SELECT * FROM t1 WHERE v1 = 11 ORDER BY h, r; + +-- Disabled this test because we do not have proper stats. We return the same cost estimate +-- for indexes t1_v1_idx and t1_v1_v2_idx and Postgres will be either of them at random. 
+-- EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE v1 = 11 AND v2 = 11; +-- SELECT * FROM t1 WHERE v1 = 11 AND v2 = 11; + +EXPLAIN (COSTS OFF) SELECT t1.h, t1.r, t1.v1, t2.v1 FROM t1, t2 WHERE t1.h = t2.h AND t1.r = t2.r; +SELECT t1.h, t1.r, t1.v1, t2.v1 FROM t1, t2 WHERE t1.h = t2.h AND t1.r = t2.r; diff --git a/src/postgres/src/test/regress/sql/yb_feature_select.sql b/src/postgres/src/test/regress/sql/yb_feature_select.sql index 7016e1f62a83..03261ef3790c 100644 --- a/src/postgres/src/test/regress/sql/yb_feature_select.sql +++ b/src/postgres/src/test/regress/sql/yb_feature_select.sql @@ -77,7 +77,8 @@ SELECT t1.col_array_text, t2.col_name FROM feature_tab_dml t1, feature_tab_dml_identifier t2 - WHERE t1.col_smallint = t2.col_id AND t2.col_name = t1.col_array_text[2]; + WHERE t1.col_smallint = t2.col_id AND t2.col_name = t1.col_array_text[2] + ORDER BY t1.col_smallint; -- SELECT t1.col_smallint, @@ -85,7 +86,8 @@ SELECT t2.col_name FROM feature_tab_dml t1, feature_tab_dml_identifier t2 WHERE t1.col_smallint = t2.col_id AND - (t2.col_name = 'nine' OR t2.col_name = 'seven'); + (t2.col_name = 'nine' OR t2.col_name = 'seven') + ORDER BY t1.col_smallint; -- UNION SELECT col_smallint Employee_ID, col_text Employee_Name