diff --git a/gpcontrib/gp_aux_catalog/Makefile b/gpcontrib/gp_aux_catalog/Makefile index 4ebc9168367..8c5da8f064d 100644 --- a/gpcontrib/gp_aux_catalog/Makefile +++ b/gpcontrib/gp_aux_catalog/Makefile @@ -4,7 +4,7 @@ MODULE_big = gp_aux_catalog OBJS = gp_aux_catalog.o $(WIN32RES) EXTENSION = gp_aux_catalog -DATA = gp_aux_catalog--1.0.sql +DATA = gp_aux_catalog--1.0.sql gp_aux_catalog--1.0--1.1.sql PGFILEDESC = "gp_aux_catalog - An auxiliar catalog extension for Greenplum" ifdef USE_PGXS diff --git a/gpcontrib/gp_aux_catalog/gp_aux_catalog--1.0--1.1.sql b/gpcontrib/gp_aux_catalog/gp_aux_catalog--1.0--1.1.sql new file mode 100644 index 00000000000..709a57b3e6c --- /dev/null +++ b/gpcontrib/gp_aux_catalog/gp_aux_catalog--1.0--1.1.sql @@ -0,0 +1,24 @@ + +CREATE FUNCTION +gpdb_binary_upgrade_catalog_1_0_to_1_1_m() +RETURNS VOID +AS 'MODULE_PATHNAME','gpdb_binary_upgrade_catalog_1_0_to_1_1' +VOLATILE +EXECUTE ON MASTER +LANGUAGE C STRICT; + + +CREATE FUNCTION +gpdb_binary_upgrade_catalog_1_0_to_1_1_seg() +RETURNS VOID +AS 'MODULE_PATHNAME','gpdb_binary_upgrade_catalog_1_0_to_1_1' +VOLATILE +EXECUTE ON ALL SEGMENTS +LANGUAGE C STRICT; + + +SELECT gpdb_binary_upgrade_catalog_1_0_to_1_1_seg(); +SELECT gpdb_binary_upgrade_catalog_1_0_to_1_1_m(); + +DROP FUNCTION gpdb_binary_upgrade_catalog_1_0_to_1_1_seg(); +DROP FUNCTION gpdb_binary_upgrade_catalog_1_0_to_1_1_m(); \ No newline at end of file diff --git a/gpcontrib/gp_aux_catalog/gp_aux_catalog.c b/gpcontrib/gp_aux_catalog/gp_aux_catalog.c index 83143412b8f..e7861ca4db5 100644 --- a/gpcontrib/gp_aux_catalog/gp_aux_catalog.c +++ b/gpcontrib/gp_aux_catalog/gp_aux_catalog.c @@ -1,7 +1,24 @@ #include "postgres.h" +#include "catalog/indexing.h" #include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "access/heapam.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_authid.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_language.h" 
+#include "catalog/pg_type.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_amop.h"
+#include "access/htup_details.h"
+
+/* XXX: fix this */
+#define BLOOM_NPROC 1
+/* #include "access/bloom/bloom.h" */
 
 PG_MODULE_MAGIC;
 
 void _PG_init(void);
 
@@ -10,6 +27,8 @@ PG_FUNCTION_INFO_V1(pg_event_trigger_ddl_commands);
 PG_FUNCTION_INFO_V1(pg_event_trigger_table_rewrite_oid);
 PG_FUNCTION_INFO_V1(pg_event_trigger_table_rewrite_reason);
 
+PG_FUNCTION_INFO_V1(gpdb_binary_upgrade_catalog_1_0_to_1_1);
+
 Datum
 pg_event_trigger_ddl_commands(PG_FUNCTION_ARGS)
 {
@@ -26,4 +45,356 @@ Datum
 pg_event_trigger_table_rewrite_reason(PG_FUNCTION_ARGS)
 {
 	return pg_event_trigger_table_rewrite_reason_internal(fcinfo);
+}
+
+/*
+ * Form a catalog tuple from values/nulls, stamp it with the fixed OID "oid",
+ * insert it into "rel" and update the catalog indexes.
+ *
+ * Errors out before touching the catalog when the descriptor has no OID
+ * column, since the row could not be stored under the requested OID.
+ */
+static void
+gpdb_binary_upgrade_store_tup(Relation rel, TupleDesc tupDesc,
+							  Datum *values, bool *nulls, Oid oid)
+{
+	HeapTuple	tuple;
+
+	if (!tupDesc->tdhasoid)
+		elog(ERROR, "failed to upgrade");
+
+	tuple = heap_form_tuple(tupDesc, values, nulls);
+	HeapTupleSetOid(tuple, oid);
+
+	simple_heap_insert(rel, tuple);
+	CatalogUpdateIndexes(rel, tuple);
+
+	heap_freetuple(tuple);
+}
+
+/*
+ * Insert a pg_proc row for an internal-language, strict, volatile function
+ * in pg_catalog whose prosrc is its own name.
+ *
+ * rel, tupDesc: open pg_proc relation and its tuple descriptor.
+ * oid: fixed OID for the new row.
+ * proname: NUL-terminated function name (also stored as prosrc).
+ * prorettype, nargs, parameterTypes: result type and argument signature.
+ */
+static void
+gpdb_binary_upgrade_insert_pro_tup(
+	Relation rel,
+	Oid oid,
+	TupleDesc tupDesc,
+	const char * proname,
+	Oid prorettype,
+	uint16 nargs,
+	oidvector *parameterTypes)
+{
+	bool		nulls[Natts_pg_proc];
+	Datum		values[Natts_pg_proc];
+	NameData	procName;
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	/* a name datum must point at NAMEDATALEN bytes, so copy into a NameData */
+	namestrcpy(&procName, proname);
+
+	values[Anum_pg_proc_proname - 1] = NameGetDatum(&procName);
+	values[Anum_pg_proc_pronamespace - 1] = ObjectIdGetDatum(PG_CATALOG_NAMESPACE);
+	values[Anum_pg_proc_proowner - 1] = ObjectIdGetDatum(BOOTSTRAP_SUPERUSERID);
+	values[Anum_pg_proc_prolang - 1] = ObjectIdGetDatum(INTERNALlanguageId);
+	values[Anum_pg_proc_procost - 1] = Float4GetDatum(1);
+	values[Anum_pg_proc_prorows - 1] = Float4GetDatum(0);
+	values[Anum_pg_proc_provariadic - 1] = ObjectIdGetDatum(InvalidOid);
+	values[Anum_pg_proc_protransform - 1] = ObjectIdGetDatum(InvalidOid);
+	values[Anum_pg_proc_proisagg - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_proiswindow - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_prosecdef - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_proleakproof - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_proisstrict - 1] = BoolGetDatum(true);
+	values[Anum_pg_proc_proretset - 1] = BoolGetDatum(false);
+	values[Anum_pg_proc_provolatile - 1] = CharGetDatum(PROVOLATILE_VOLATILE);
+	values[Anum_pg_proc_pronargs - 1] = UInt16GetDatum(nargs);
+	values[Anum_pg_proc_pronargdefaults - 1] = UInt16GetDatum(0);
+	values[Anum_pg_proc_prorettype - 1] = ObjectIdGetDatum(prorettype);
+	values[Anum_pg_proc_proargtypes - 1] = PointerGetDatum(parameterTypes);
+	nulls[Anum_pg_proc_proallargtypes - 1] = true;
+	nulls[Anum_pg_proc_proargmodes - 1] = true;
+	nulls[Anum_pg_proc_proargnames - 1] = true;
+	nulls[Anum_pg_proc_proargdefaults - 1] = true;
+	values[Anum_pg_proc_prosrc - 1] = CStringGetTextDatum(proname);
+	nulls[Anum_pg_proc_probin - 1] = true;
+	nulls[Anum_pg_proc_proconfig - 1] = true;
+	/* proacl will be determined later */
+	nulls[Anum_pg_proc_proacl - 1] = true;
+	values[Anum_pg_proc_prodataaccess - 1] = CharGetDatum(PRODATAACCESS_NONE);
+	values[Anum_pg_proc_proexeclocation - 1] = CharGetDatum(PROEXECLOCATION_ANY);
+
+	gpdb_binary_upgrade_store_tup(rel, tupDesc, values, nulls, oid);
+}
+
+#define F_BLOOMAMOID 7214
+
+/*
+ * Insert the pg_am row for the "bloom" access method under F_BLOOMAMOID.
+ */
+static void
+gpdb_binary_upgrade_insert_am_tup(
+	Relation rel,
+	TupleDesc tupDesc
+)
+{
+	bool		nulls[Natts_pg_am];
+	Datum		values[Natts_pg_am];
+	NameData	amName;
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	/* a bare string literal is shorter than NAMEDATALEN; use a NameData */
+	namestrcpy(&amName, "bloom");
+
+	values[Anum_pg_am_amname - 1] = NameGetDatum(&amName);
+	values[Anum_pg_am_amstrategies - 1] = Int16GetDatum(0);
+	values[Anum_pg_am_amsupport - 1] = Int16GetDatum(BLOOM_NPROC);
+	values[Anum_pg_am_amcanorder - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amcanorderbyop - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amcanbackward - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amcanunique - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amcanmulticol - 1] = BoolGetDatum(true);
+	values[Anum_pg_am_amoptionalkey - 1] = BoolGetDatum(true);
+	values[Anum_pg_am_amsearcharray - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amsearchnulls - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amstorage - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amclusterable - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_ampredlocks - 1] = BoolGetDatum(false);
+	values[Anum_pg_am_amkeytype - 1] = ObjectIdGetDatum(InvalidOid);
+	values[Anum_pg_am_aminsert - 1] = ObjectIdGetDatum(F_BLINSERT);
+	values[Anum_pg_am_ambeginscan - 1] = ObjectIdGetDatum(F_BLBEGINSCAN);
+	values[Anum_pg_am_amgettuple - 1] = ObjectIdGetDatum(InvalidOid); /* F_BLGETTUPLE ? */
+	values[Anum_pg_am_amgetbitmap - 1] = ObjectIdGetDatum(F_BLGETBITMAP);
+	values[Anum_pg_am_amrescan - 1] = ObjectIdGetDatum(F_BLRESCAN);
+	values[Anum_pg_am_amendscan - 1] = ObjectIdGetDatum(F_BLENDSCAN);
+	values[Anum_pg_am_ammarkpos - 1] = ObjectIdGetDatum(F_BLMARKPOS);
+	values[Anum_pg_am_amrestrpos - 1] = ObjectIdGetDatum(F_BLRESTRPOS);
+	values[Anum_pg_am_ambuild - 1] = ObjectIdGetDatum(F_BLBUILD);
+	values[Anum_pg_am_ambuildempty - 1] = ObjectIdGetDatum(F_BLBUILDEMPTY);
+	values[Anum_pg_am_ambulkdelete - 1] = ObjectIdGetDatum(F_BLBULKDELETE);
+	values[Anum_pg_am_amvacuumcleanup - 1] = ObjectIdGetDatum(F_BLVACUUMCLEANUP);
+	values[Anum_pg_am_amcanreturn - 1] = ObjectIdGetDatum(InvalidOid);
+	values[Anum_pg_am_amcostestimate - 1] = ObjectIdGetDatum(F_BLCOSTESTIMATE);
+	values[Anum_pg_am_amoptions - 1] = ObjectIdGetDatum(F_BLOPTIONS);
+
+	gpdb_binary_upgrade_store_tup(rel, tupDesc, values, nulls, F_BLOOMAMOID);
+}
+
+#define F_BLOPFAMILYOID 7215
+
+/*
+ * Insert the pg_opfamily row named "opfname" for the bloom access method
+ * under F_BLOPFAMILYOID.
+ */
+static void
+gpdb_binary_upgrade_insert_opfamily_tup(Relation rel, const char * opfname)
+{
+	Datum		values[Natts_pg_opfamily];
+	bool		nulls[Natts_pg_opfamily];
+	NameData	opfName;
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	namestrcpy(&opfName, opfname);
+
+	values[Anum_pg_opfamily_opfmethod - 1] = ObjectIdGetDatum(F_BLOOMAMOID);
+	values[Anum_pg_opfamily_opfname - 1] = NameGetDatum(&opfName);
+	values[Anum_pg_opfamily_opfnamespace - 1] = ObjectIdGetDatum(PG_CATALOG_NAMESPACE);
+	values[Anum_pg_opfamily_opfowner - 1] = ObjectIdGetDatum(BOOTSTRAP_SUPERUSERID);
+
+	gpdb_binary_upgrade_store_tup(rel, rel->rd_att, values, nulls, F_BLOPFAMILYOID);
+}
+
+#define F_BLOPCLASSINT4OID 7216
+
+/*
+ * Insert the default int4 pg_opclass row named "opcname" for bloom, attached
+ * to the F_BLOPFAMILYOID operator family.
+ */
+static void
+gpdb_binary_upgrade_insert_opclass_tup(Relation rel, const char * opcname)
+{
+	Datum		values[Natts_pg_opclass];
+	bool		nulls[Natts_pg_opclass];
+	NameData	opcName;
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	namestrcpy(&opcName, opcname);
+
+	values[Anum_pg_opclass_opcmethod - 1] = ObjectIdGetDatum(F_BLOOMAMOID);
+	values[Anum_pg_opclass_opcname - 1] = NameGetDatum(&opcName);
+	values[Anum_pg_opclass_opcnamespace - 1] = ObjectIdGetDatum(PG_CATALOG_NAMESPACE);
+	values[Anum_pg_opclass_opcowner - 1] = ObjectIdGetDatum(BOOTSTRAP_SUPERUSERID);
+	values[Anum_pg_opclass_opcfamily - 1] = ObjectIdGetDatum(F_BLOPFAMILYOID);
+	values[Anum_pg_opclass_opcintype - 1] = ObjectIdGetDatum(INT4OID);
+	values[Anum_pg_opclass_opcdefault - 1] = BoolGetDatum(true);
+	values[Anum_pg_opclass_opckeytype - 1] = ObjectIdGetDatum(InvalidOid);
+
+	gpdb_binary_upgrade_store_tup(rel, rel->rd_att, values, nulls, F_BLOPCLASSINT4OID);
+}
+
+#define F_BLAMPROCINT4OID 7217
+
+/*
+ * Insert the pg_amproc row registering support procedure 1 of the bloom
+ * int4 operator family.  Bloom's only support procedure is the hash function
+ * (bloom--1.0.sql declares FUNCTION 1 hashint4(int4)), so register hashint4
+ * rather than the two-argument btree comparator.
+ */
+static void
+gpdb_binary_upgrade_insert_amproc_tup(Relation rel)
+{
+	Datum		values[Natts_pg_amproc];
+	bool		nulls[Natts_pg_amproc];
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	values[Anum_pg_amproc_amprocfamily - 1] = ObjectIdGetDatum(F_BLOPFAMILYOID);
+	values[Anum_pg_amproc_amproclefttype - 1] = ObjectIdGetDatum(INT4OID);
+	values[Anum_pg_amproc_amprocrighttype - 1] = ObjectIdGetDatum(INT4OID);
+	values[Anum_pg_amproc_amprocnum - 1] = Int16GetDatum(1);
+	values[Anum_pg_amproc_amproc - 1] = ObjectIdGetDatum(F_HASHINT4);
+
+	gpdb_binary_upgrade_store_tup(rel, rel->rd_att, values, nulls, F_BLAMPROCINT4OID);
+}
+
+#define F_BLINT4EQOPOID 96
+#define F_BLAMOPINT4OID 7218
+/* XXX: mirrors BLOOM_EQUAL_STRATEGY in access/bloom/bloom.h */
+#define BLOOM_EQUAL_STRATEGY 1
+
+/*
+ * Insert the pg_amop row that makes int4 "=" (F_BLINT4EQOPOID) the search
+ * operator for the equality strategy of the bloom int4 operator family.
+ */
+static void
+gpdb_binary_upgrade_insert_amop_tup(Relation rel)
+{
+	Datum		values[Natts_pg_amop];
+	bool		nulls[Natts_pg_amop];
+
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	values[Anum_pg_amop_amopfamily - 1] = ObjectIdGetDatum(F_BLOPFAMILYOID);
+	values[Anum_pg_amop_amoplefttype - 1] = ObjectIdGetDatum(INT4OID);
+	values[Anum_pg_amop_amoprighttype - 1] = ObjectIdGetDatum(INT4OID);
+	values[Anum_pg_amop_amopstrategy - 1] = Int16GetDatum(BLOOM_EQUAL_STRATEGY);
+	values[Anum_pg_amop_amoppurpose - 1] = CharGetDatum(AMOP_SEARCH);
+	values[Anum_pg_amop_amopopr - 1] = ObjectIdGetDatum(F_BLINT4EQOPOID);
+	values[Anum_pg_amop_amopmethod - 1] = ObjectIdGetDatum(F_BLOOMAMOID);
+	values[Anum_pg_amop_amopsortfamily - 1] = ObjectIdGetDatum(InvalidOid);
+
+	gpdb_binary_upgrade_store_tup(rel, rel->rd_att, values, nulls, F_BLAMOPINT4OID);
+}
+
+/* Description of one pg_proc row to create for the bloom access method. */
+typedef struct BloomProcDesc
+{
+	const char *proname;
+	Oid			oid;
+	Oid			prorettype;
+	uint16		nargs;
+} BloomProcDesc;
+
+#define BLOOM_MAX_PROC_NARGS 7
+
+/*
+ * Every argument is declared "internal", except for bloptions which takes
+ * (text[], bool).
+ * NOTE(review): argument counts and result types are carried over unchanged;
+ * confirm them against the index AM API this server expects.
+ */
+static const BloomProcDesc bloom_procs[] = {
+	{"blbuild", F_BLBUILD, INTERNALOID, 3},
+	{"blbuildempty", F_BLBUILDEMPTY, INTERNALOID, 1},
+	{"blinsert", F_BLINSERT, BOOLOID, 6},
+	{"blbeginscan", F_BLBEGINSCAN, INTERNALOID, 3},
+	{"blgettuple", F_BLGETTUPLE, BOOLOID, 2},
+	{"blgetbitmap", F_BLGETBITMAP, BOOLOID, 2},
+	{"blrescan", F_BLRESCAN, VOIDOID, 2},
+	{"blendscan", F_BLENDSCAN, VOIDOID, 2},
+	{"blmarkpos", F_BLMARKPOS, VOIDOID, 1},
+	{"blrestrpos", F_BLRESTRPOS, VOIDOID, 1},
+	{"blbulkdelete", F_BLBULKDELETE, INTERNALOID, 4},
+	{"blvacuumcleanup", F_BLVACUUMCLEANUP, INTERNALOID, 2},
+	{"bloptions", F_BLOPTIONS, BYTEAOID, 2},
+	{"blcostestimate", F_BLCOSTESTIMATE, VOIDOID, 7}
+};
+
+/*
+ * Catalog upgrade 1.0 -> 1.1: register the bloom index access method by
+ * inserting its pg_proc, pg_am, pg_opfamily, pg_opclass, pg_amproc and
+ * pg_amop rows with fixed OIDs.
+ */
+Datum
+gpdb_binary_upgrade_catalog_1_0_to_1_1(PG_FUNCTION_ARGS)
+{
+	Relation	pgprocrel;
+	Relation	pgamrel;
+	Relation	pgopcrel;
+	Relation	pgopfrel;
+	Relation	pgamprocrel;
+	Relation	pgamoprel;
+	int			i;
+
+	pgprocrel = relation_open(ProcedureRelationId, RowExclusiveLock);
+	pgamrel = relation_open(AccessMethodRelationId, RowExclusiveLock);
+	pgopcrel = relation_open(OperatorClassRelationId, RowExclusiveLock);
+	pgopfrel = relation_open(OperatorFamilyRelationId, RowExclusiveLock);
+	pgamprocrel = relation_open(AccessMethodProcedureRelationId, RowExclusiveLock);
+	pgamoprel = relation_open(AccessMethodOperatorRelationId, RowExclusiveLock);
+
+	for (i = 0; i < (int) lengthof(bloom_procs); i++)
+	{
+		const BloomProcDesc *desc = &bloom_procs[i];
+		Oid			procArgTypes[BLOOM_MAX_PROC_NARGS];
+		int			argno;
+
+		Assert(desc->nargs <= BLOOM_MAX_PROC_NARGS);
+
+		for (argno = 0; argno < desc->nargs; argno++)
+			procArgTypes[argno] = INTERNALOID;
+
+		if (desc->oid == F_BLOPTIONS)
+		{
+			procArgTypes[0] = TEXTARRAYOID;
+			procArgTypes[1] = BOOLOID;
+		}
+
+		gpdb_binary_upgrade_insert_pro_tup(pgprocrel, desc->oid,
+										   RelationGetDescr(pgprocrel),
+										   desc->proname, desc->prorettype,
+										   desc->nargs,
+										   buildoidvector(procArgTypes, desc->nargs));
+	}
+
+	gpdb_binary_upgrade_insert_am_tup(pgamrel, RelationGetDescr(pgamrel));
+	gpdb_binary_upgrade_insert_opfamily_tup(pgopfrel, "int4_ops");
+	gpdb_binary_upgrade_insert_opclass_tup(pgopcrel, "int4_ops");
+	gpdb_binary_upgrade_insert_amproc_tup(pgamprocrel);
+	gpdb_binary_upgrade_insert_amop_tup(pgamoprel);
+
+	relation_close(pgopcrel, RowExclusiveLock);
+	relation_close(pgopfrel, RowExclusiveLock);
+	relation_close(pgprocrel, RowExclusiveLock);
+	relation_close(pgamrel, RowExclusiveLock);
+	relation_close(pgamprocrel, RowExclusiveLock);
+	relation_close(pgamoprel, RowExclusiveLock);
+
+	PG_RETURN_VOID();
 }
\ No newline at end of file
diff --git a/gpcontrib/yezzey b/gpcontrib/yezzey
index 8890636a232..e0c707cdda7 160000
--- a/gpcontrib/yezzey
+++ b/gpcontrib/yezzey
@@ -1 +1 @@
-Subproject commit 8890636a232e2df06450da1d177f127218f55a5e
+Subproject commit e0c707cdda7235c454b5c74d493c2189badfeb16
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index 9613f7cfe3d..02cf4c83850 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/access
 top_builddir = ../../..
include $(top_builddir)/src/Makefile.global -SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam external bitmap appendonly aocs +SUBDIRS = common gin gist hash heap index nbtree rmgrdesc spgist transam external bitmap appendonly aocs bloom include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/bloom/.gitignore b/src/backend/access/bloom/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/backend/access/bloom/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/backend/access/bloom/Makefile b/src/backend/access/bloom/Makefile new file mode 100644 index 00000000000..2ab909aafa7 --- /dev/null +++ b/src/backend/access/bloom/Makefile @@ -0,0 +1,15 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/bloom +# +#------------------------------------------------------------------------- + + +subdir = src/backend/access/bloom +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = blinsert.o blscan.o blutils.o blvacuum.o bloom.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/bloom/blinsert.c b/src/backend/access/bloom/blinsert.c new file mode 100644 index 00000000000..ffb6ebaab51 --- /dev/null +++ b/src/backend/access/bloom/blinsert.c @@ -0,0 +1,367 @@ +/*------------------------------------------------------------------------- + * + * blinsert.c + * Bloom index build and insert functions. 
+ *
+ * Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/bloom/blinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "access/heapam_xlog.h"
+
+#include "bloom.h"
+
+/*
+ * State of bloom index build.  We accumulate one page data here before
+ * flushing it to buffer manager.
+ */
+typedef struct
+{
+	BloomState	blstate;		/* bloom index state */
+	int64		indtuples;		/* total number of tuples indexed */
+	MemoryContext tmpCtx;		/* temporary memory context reset after each
+								 * tuple */
+	PGAlignedBlock data;		/* cached page */
+	int			count;			/* number of tuples in cached page */
+} BloomBuildState;
+
+/*
+ * Flush the cached page: copy its image into a buffer from BloomNewBuffer.
+ */
+static void
+flushCachedPage(Relation index, BloomBuildState *buildstate)
+{
+	Page		page;
+	Buffer		buffer = BloomNewBuffer(index);
+
+	page = BufferGetPage(buffer);
+	memcpy(page, buildstate->data.data, BLCKSZ);
+	/* Only marks the buffer dirty; no WAL record is written here */
+	MarkBufferDirty(buffer);
+	UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * (Re)initialize cached page in BloomBuildState: zero it, format it, count = 0.
+ */
+static void
+initCachedPage(BloomBuildState *buildstate)
+{
+	memset(buildstate->data.data, 0, BLCKSZ);
+	BloomInitPage(buildstate->data.data, 0);
+	buildstate->count = 0;
+}
+
+/*
+ * Per-tuple callback invoked by the IndexBuildScan call in blbuild.
+ */
+static void
+bloomBuildCallback(Relation index, HeapTuple htup, Datum *values,
+				   bool *isnull, bool tupleIsAlive, void *state)
+{
+	BloomBuildState *build = (BloomBuildState *) state;
+	BloomState *blstate = &build->blstate;
+	Page		cached = build->data.data;
+	MemoryContext callerCtx = MemoryContextSwitchTo(build->tmpCtx);
+	BloomTuple *newTuple;
+
+	/*
+	 * NOTE(review): assumes the build scan hands this callback a HeapTuple;
+	 * confirm against the IndexBuildCallback signature of this tree.
+	 */
+	newTuple = BloomFormTuple(blstate, &htup->t_self, values, isnull);
+
+	/* Try to add the tuple to the cached page */
+	if (!BloomPageAddItem(blstate, cached, newTuple))
+	{
+		/* Cached page is full: write it out and start over on a fresh one */
+		flushCachedPage(index, build);
+
+		CHECK_FOR_INTERRUPTS();
+
+		initCachedPage(build);
+
+		/* A freshly initialized page must always have room for one tuple */
+		if (!BloomPageAddItem(blstate, cached, newTuple))
+		{
+			elog(ERROR, "could not add new bloom tuple to empty page");
+		}
+	}
+
+	/* The tuple is on the cached page now; account for it */
+	build->count++;
+
+	/* Update total tuple count */
+	build->indtuples += 1;
+
+	/* Leave the per-tuple context and release everything it allocated */
+	MemoryContextSwitchTo(callerCtx);
+	MemoryContextReset(build->tmpCtx);
+}
+
+/*
+ * Build a new bloom index.
+ */
+Datum
+blbuild(PG_FUNCTION_ARGS)
+{
+	IndexBuildResult *result;
+	double		reltuples;
+	BloomBuildState buildstate;
+
+	Relation	heap = (Relation) PG_GETARG_POINTER(0);
+	Relation	index = (Relation) PG_GETARG_POINTER(1);
+	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+
+	if (RelationGetNumberOfBlocks(index) != 0)
+		elog(ERROR, "index \"%s\" already contains data",
+			 RelationGetRelationName(index));
+
+	/* Initialize the meta page */
+	BloomInitMetapage(index);
+
+	/* Initialize the bloom build state */
+	memset(&buildstate, 0, sizeof(buildstate));
+	initBloomState(&buildstate.blstate, index);
+	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
+											  "Bloom build temporary context",
+											  ALLOCSET_DEFAULT_SIZES);
+	initCachedPage(&buildstate);
+
+	/* Do the heap scan */
+	reltuples = IndexBuildScan(heap, index, indexInfo, true,
+							   bloomBuildCallback, (void *) &buildstate);
+
+	/* Flush last page if needed (it will be, unless heap was empty) */
+	if (buildstate.count > 0)
+		flushCachedPage(index, &buildstate);
+
+	MemoryContextDelete(buildstate.tmpCtx);
+
+	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+	result->heap_tuples = reltuples;
+	result->index_tuples = buildstate.indtuples;
+
+	PG_RETURN_POINTER(result);
+}
+
+/*
+ * Build an empty bloom index in the initialization fork.
+ */
+
+Datum
+blbuildempty(PG_FUNCTION_ARGS)
+{
+	Page		metapage;
+
+	Relation	index = (Relation) PG_GETARG_POINTER(0);
+
+	/* Construct metapage. */
+	metapage = (Page) palloc(BLCKSZ);
+	BloomFillMetapage(index, metapage);
+
+	/*
+	 * Write the page and log it.  It might seem that an immediate sync
+	 * would be sufficient to guarantee that the file exists on disk, but
+	 * recovery itself might remove it while replaying, for example, an
+	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we
+	 * need this even when wal_level=minimal.
+	 */
+	PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO);
+	smgrwrite(index->rd_smgr, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO,
+			  (char *) metapage, true);
+	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+				BLOOM_METAPAGE_BLKNO, metapage, false);
+
+	/*
+	 * An immediate sync is required even if we xlog'd the page, because the
+	 * write did not go through shared_buffers and therefore a concurrent
+	 * checkpoint may have moved the redo pointer past our xlog record.
+	 */
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	PG_RETURN_VOID();
+}
+
+/*
+ * Insert new tuple to the bloom index.  Every page this function modifies
+ * (data page and, when changed, the metapage) is marked dirty before release.
+ */
+Datum
+blinsert(PG_FUNCTION_ARGS)
+{
+	BloomState	blstate;
+	BloomTuple *itup;
+	MemoryContext oldCtx;
+	MemoryContext insertCtx;
+	BloomMetaPageData *metaData;
+	Buffer		buffer,
+				metaBuffer;
+	Page		page,
+				metaPage;
+	BlockNumber blkno = InvalidBlockNumber;
+	OffsetNumber nStart;
+
+	Relation	index = (Relation) PG_GETARG_POINTER(0);
+	Datum	   *values = (Datum *) PG_GETARG_POINTER(1);
+	bool	   *isnull = (bool *) PG_GETARG_POINTER(2);
+	ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
+
+	insertCtx = AllocSetContextCreate(CurrentMemoryContext,
+									  "Bloom insert temporary context",
+									  ALLOCSET_DEFAULT_SIZES);
+
+	oldCtx = MemoryContextSwitchTo(insertCtx);
+
+	initBloomState(&blstate, index);
+	itup = BloomFormTuple(&blstate, ht_ctid, values, isnull);
+
+	/*
+	 * At first, try to insert new tuple to the first page in notFullPage
+	 * array.  If successful, we don't need to modify the meta page.
+	 */
+	metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
+	LockBuffer(metaBuffer, BUFFER_LOCK_SHARE);
+	metaData = BloomPageGetMeta(BufferGetPage(metaBuffer));
+
+	if (metaData->nEnd > metaData->nStart)
+	{
+		Page		page;
+
+		blkno = metaData->notFullPage[metaData->nStart];
+		Assert(blkno != InvalidBlockNumber);
+
+		/* Don't hold metabuffer lock while doing insert */
+		LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
+
+		buffer = ReadBuffer(index, blkno);
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buffer);
+
+		/*
+		 * We might have found a page that was recently deleted by VACUUM.  If
+		 * so, we can reuse it, but we must reinitialize it.
+		 */
+		if (PageIsNew(page) || BloomPageIsDeleted(page))
+			BloomInitPage(page, 0);
+
+		if (BloomPageAddItem(&blstate, page, itup))
+		{
+			/* Success!  Apply the change, clean up, and exit */
+
+			MarkBufferDirty(buffer);
+			UnlockReleaseBuffer(buffer);
+			ReleaseBuffer(metaBuffer);
+			MemoryContextSwitchTo(oldCtx);
+			MemoryContextDelete(insertCtx);
+			PG_RETURN_BOOL(false);
+		}
+
+		/* Didn't fit, must try other pages */
+		UnlockReleaseBuffer(buffer);
+	}
+	else
+	{
+		/* No entries in notFullPage */
+		LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
+	}
+
+	/*
+	 * Try other pages in notFullPage array.  We will have to change nStart in
+	 * metapage.  Thus, grab exclusive lock on metapage.
+	 */
+	LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+	/* nStart might have changed while we didn't have lock */
+	nStart = metaData->nStart;
+
+	/* Skip first page if we already tried it above */
+	if (nStart < metaData->nEnd &&
+		blkno == metaData->notFullPage[nStart])
+		nStart++;
+
+	/*
+	 * This loop iterates for each page we try from the notFullPage array.  We
+	 * keep the exclusive metapage lock throughout, including the fallback case
+	 * of having to allocate a new page.
+	 */
+	for (;;)
+	{
+		/* re-fetch pointers into the (exclusively locked) metapage */
+		metaPage = BufferGetPage(metaBuffer);
+		metaData = BloomPageGetMeta(metaPage);
+
+		if (nStart >= metaData->nEnd)
+			break;				/* no more entries in notFullPage array */
+
+		blkno = metaData->notFullPage[nStart];
+		Assert(blkno != InvalidBlockNumber);
+
+		buffer = ReadBuffer(index, blkno);
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buffer);
+
+		/* Basically same logic as above */
+		if (PageIsNew(page) || BloomPageIsDeleted(page))
+			BloomInitPage(page, 0);
+
+		if (BloomPageAddItem(&blstate, page, itup))
+		{
+			/* Success!  Both pages changed, so mark both dirty, then exit */
+			metaData->nStart = nStart;
+			MarkBufferDirty(metaBuffer);
+			MarkBufferDirty(buffer);
+			UnlockReleaseBuffer(buffer);
+			UnlockReleaseBuffer(metaBuffer);
+			MemoryContextSwitchTo(oldCtx);
+			MemoryContextDelete(insertCtx);
+			PG_RETURN_BOOL(false);
+		}
+
+		/* Didn't fit, must try other pages */
+		UnlockReleaseBuffer(buffer);
+		nStart++;
+	}
+
+	/*
+	 * Didn't find place to insert in notFullPage array.  Allocate new page.
+	 * (XXX is it good to do this while holding ex-lock on the metapage??)
+	 */
+	buffer = BloomNewBuffer(index);
+
+	page = BufferGetPage(buffer);
+	BloomInitPage(page, 0);
+
+	if (!BloomPageAddItem(&blstate, page, itup))
+	{
+		/* We shouldn't be here since we're inserting to an empty page */
+		elog(ERROR, "could not add new bloom tuple to empty page");
+	}
+
+	/* Reset notFullPage array to contain just this new page */
+	metaData->nStart = 0;
+	metaData->nEnd = 1;
+	metaData->notFullPage[0] = BufferGetBlockNumber(buffer);
+	/* Mark the new data page and the metapage dirty, clean up, and exit */
+	MarkBufferDirty(buffer);
+	MarkBufferDirty(metaBuffer);
+	UnlockReleaseBuffer(buffer);
+	UnlockReleaseBuffer(metaBuffer);
+
+	MemoryContextSwitchTo(oldCtx);
+	MemoryContextDelete(insertCtx);
+
+	PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/access/bloom/bloom--1.0.sql b/src/backend/access/bloom/bloom--1.0.sql
new file mode 100644
index 00000000000..4e7c9226bcd
--- /dev/null
+++ b/src/backend/access/bloom/bloom--1.0.sql
@@ -0,0 +1,25 @@
+/* contrib/bloom/bloom--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION bloom" to load this file. \quit
+
+CREATE FUNCTION blhandler(internal)
+RETURNS index_am_handler
+AS 'MODULE_PATHNAME'
+LANGUAGE C;
+
+-- Access method
+CREATE ACCESS METHOD bloom TYPE INDEX HANDLER blhandler;
+COMMENT ON ACCESS METHOD bloom IS 'bloom index access method';
+
+-- Opclasses
+
+CREATE OPERATOR CLASS int4_ops
+DEFAULT FOR TYPE int4 USING bloom AS
+	OPERATOR	1	=(int4, int4),
+	FUNCTION	1	hashint4(int4);
+
+CREATE OPERATOR CLASS text_ops
+DEFAULT FOR TYPE text USING bloom AS
+	OPERATOR	1	=(text, text),
+	FUNCTION	1	hashtext(text);
diff --git a/src/backend/access/bloom/bloom.c b/src/backend/access/bloom/bloom.c
new file mode 100644
index 00000000000..77600658217
--- /dev/null
+++ b/src/backend/access/bloom/bloom.c
@@ -0,0 +1,40 @@
+
+#include "postgres.h"
+
+
+#include "bloom.h"
+
+/* blgettuple() -- not supported by the bloom AM; always raises an error. */
+Datum
+blgettuple(PG_FUNCTION_ARGS)
+{
+	elog(ERROR, "unsupported");
+
+	PG_RETURN_BOOL(false);
+}
+
+
+/*
+ * blmarkpos() -- save the current scan position.
+ */ +Datum +blmarkpos(PG_FUNCTION_ARGS) +{ + /* Saving a scan position is not supported: always raise an error */ + elog(ERROR, "unsupported"); + + PG_RETURN_VOID(); +} + +/* + * blrestrpos() -- restoring a saved scan position is not supported; this always raises an error. + */ +Datum +blrestrpos(PG_FUNCTION_ARGS) +{ + + elog(ERROR, "unsupported"); + + PG_RETURN_VOID(); +} + diff --git a/src/backend/access/bloom/bloom.control b/src/backend/access/bloom/bloom.control new file mode 100644 index 00000000000..4d4124b3b05 --- /dev/null +++ b/src/backend/access/bloom/bloom.control @@ -0,0 +1,5 @@ +# bloom extension +comment = 'bloom access method - signature file based index' +default_version = '1.0' +module_pathname = '$libdir/bloom' +relocatable = true diff --git a/src/backend/access/bloom/bloom.h b/src/backend/access/bloom/bloom.h new file mode 100644 index 00000000000..a1b275fcc6b --- /dev/null +++ b/src/backend/access/bloom/bloom.h @@ -0,0 +1,201 @@ +/*------------------------------------------------------------------------- + * + * bloom.h + * Header for bloom index. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/bloom/bloom.h + * + *------------------------------------------------------------------------- + */ +#ifndef _BLOOM_H_ +#define _BLOOM_H_ + +#include "access/itup.h" +#include "access/xlog.h" +#include "nodes/relation.h" +#include "fmgr.h" + +/* Support procedure numbers */ +#define BLOOM_HASH_PROC 1 +#define BLOOM_NPROC 1 + +/* Scan strategies */ +#define BLOOM_EQUAL_STRATEGY 1 +#define BLOOM_NSTRATEGIES 1 + +/* Opaque for bloom pages */ +typedef struct BloomPageOpaqueData +{ + OffsetNumber maxoff; /* number of index tuples on page */ + uint16 flags; /* see bit definitions below */ + uint16 unused; /* placeholder to force maxaligning of size of + * BloomPageOpaqueData and to place + * bloom_page_id exactly at the end of page */ + uint16 bloom_page_id; /* for identification of BLOOM indexes */ +} BloomPageOpaqueData; + +typedef BloomPageOpaqueData *BloomPageOpaque; + +/* Bloom page flags */ +#define
BLOOM_META (1<<0) +#define BLOOM_DELETED (2<<0) + +/* + * The page ID is for the convenience of pg_filedump and similar utilities, + * which otherwise would have a hard time telling pages of different index + * types apart. It should be the last 2 bytes on the page. This is more or + * less "free" due to alignment considerations. + * + * See comments above GinPageOpaqueData. + */ +#define BLOOM_PAGE_ID 0xFF83 + +/* Macros for accessing bloom page structures */ +#define BloomPageGetOpaque(page) ((BloomPageOpaque) PageGetSpecialPointer(page)) +#define BloomPageGetMaxOffset(page) (BloomPageGetOpaque(page)->maxoff) +#define BloomPageIsMeta(page) \ + ((BloomPageGetOpaque(page)->flags & BLOOM_META) != 0) +#define BloomPageIsDeleted(page) \ + ((BloomPageGetOpaque(page)->flags & BLOOM_DELETED) != 0) +#define BloomPageSetDeleted(page) \ + (BloomPageGetOpaque(page)->flags |= BLOOM_DELETED) +#define BloomPageSetNonDeleted(page) \ + (BloomPageGetOpaque(page)->flags &= ~BLOOM_DELETED) +#define BloomPageGetData(page) ((BloomTuple *)PageGetContents(page)) +#define BloomPageGetTuple(state, page, offset) \ + ((BloomTuple *)(PageGetContents(page) \ + + (state)->sizeOfBloomTuple * ((offset) - 1))) +#define BloomPageGetNextTuple(state, tuple) \ + ((BloomTuple *)((Pointer)(tuple) + (state)->sizeOfBloomTuple)) + +/* Preserved page numbers */ +#define BLOOM_METAPAGE_BLKNO (0) +#define BLOOM_HEAD_BLKNO (1) /* first data page */ + +/* + * We store Bloom signatures as arrays of uint16 words. + */ +typedef uint16 BloomSignatureWord; + +#define SIGNWORDBITS ((int) (BITS_PER_BYTE * sizeof(BloomSignatureWord))) + +/* + * Default and maximum Bloom signature length in bits. + */ +#define DEFAULT_BLOOM_LENGTH (5 * SIGNWORDBITS) +#define MAX_BLOOM_LENGTH (256 * SIGNWORDBITS) + +/* + * Default and maximum signature bits generated per index key. 
+ */ +#define DEFAULT_BLOOM_BITS 2 +#define MAX_BLOOM_BITS (MAX_BLOOM_LENGTH - 1) + +/* Bloom index options */ +typedef struct BloomOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int bloomLength; /* length of signature in words (not bits!) */ + int bitSize[INDEX_MAX_KEYS]; /* # of bits generated for + * each index key */ +} BloomOptions; + +/* + * FreeBlockNumberArray - array of block numbers sized so that metadata fill + * all space in metapage. + */ +typedef BlockNumber FreeBlockNumberArray[ + MAXALIGN_DOWN( + BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData)) + - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions)) + ) / sizeof(BlockNumber) +]; + +/* Metadata of bloom index */ +typedef struct BloomMetaPageData +{ + uint32 magickNumber; + uint16 nStart; + uint16 nEnd; + BloomOptions opts; + FreeBlockNumberArray notFullPage; +} BloomMetaPageData; + +/* Magic number to distinguish bloom pages among anothers */ +#define BLOOM_MAGICK_NUMBER (0xDBAC0DED) + +/* Number of blocks numbers fit in BloomMetaPageData */ +#define BloomMetaBlockN (sizeof(FreeBlockNumberArray) / sizeof(BlockNumber)) + +#define BloomPageGetMeta(page) ((BloomMetaPageData *) PageGetContents(page)) + +typedef struct BloomState +{ + FmgrInfo hashFn[INDEX_MAX_KEYS]; + BloomOptions opts; /* copy of options on index's metapage */ + int32 nColumns; + + /* + * sizeOfBloomTuple is index-specific, and it depends on reloptions, so + * precompute it + */ + Size sizeOfBloomTuple; +} BloomState; + +#define BloomPageGetFreeSpace(state, page) \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + - BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple \ + - MAXALIGN(sizeof(BloomPageOpaqueData))) + +/* + * Tuples are very different from all other relations + */ +typedef struct BloomTuple +{ + ItemPointerData heapPtr; + BloomSignatureWord sign[FLEXIBLE_ARRAY_MEMBER]; +} BloomTuple; + +#define BLOOMTUPLEHDRSZ offsetof(BloomTuple, sign) + +/* Opaque data 
structure for bloom index scan */ +typedef struct BloomScanOpaqueData +{ + BloomSignatureWord *sign; /* Scan signature */ + BloomState state; +} BloomScanOpaqueData; + +typedef BloomScanOpaqueData *BloomScanOpaque; + +/* blutils.c */ +extern void _PG_init(void); +extern Datum blhandler(PG_FUNCTION_ARGS); +extern void initBloomState(BloomState *state, Relation index); +extern void BloomFillMetapage(Relation index, Page metaPage); +extern void BloomInitMetapage(Relation index); +extern void BloomInitPage(Page page, uint16 flags); +extern Buffer BloomNewBuffer(Relation index); +extern void signValue(BloomState *state, BloomSignatureWord *sign, Datum value, int attno); +extern BloomTuple *BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull); +extern bool BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple); + +/* blvalidate.c */ +/* index access method interface functions */ +extern Datum blbuild(PG_FUNCTION_ARGS); +extern Datum blbuildempty(PG_FUNCTION_ARGS); +extern Datum blinsert(PG_FUNCTION_ARGS); +extern Datum blbeginscan(PG_FUNCTION_ARGS); +extern Datum blgettuple(PG_FUNCTION_ARGS); +extern Datum blgetbitmap(PG_FUNCTION_ARGS); +extern Datum blrescan(PG_FUNCTION_ARGS); +extern Datum blendscan(PG_FUNCTION_ARGS); +extern Datum blmarkpos(PG_FUNCTION_ARGS); +extern Datum blrestrpos(PG_FUNCTION_ARGS); +extern Datum blbulkdelete(PG_FUNCTION_ARGS); +extern Datum blvacuumcleanup(PG_FUNCTION_ARGS); +extern Datum bloptions(PG_FUNCTION_ARGS); + +#endif diff --git a/src/backend/access/bloom/blscan.c b/src/backend/access/bloom/blscan.c new file mode 100644 index 00000000000..dc4931b4078 --- /dev/null +++ b/src/backend/access/bloom/blscan.c @@ -0,0 +1,209 @@ +/*------------------------------------------------------------------------- + * + * blscan.c + * Bloom index scan functions. 
+ * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/bloom/blscan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "pgstat.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include "bloom.h" + +/* + * Begin scan of bloom index: create the scan descriptor and attach a BloomScanOpaque holding the index's bloom state and no search signature yet. + */ +Datum +blbeginscan(PG_FUNCTION_ARGS) +{ + Relation r = (Relation) PG_GETARG_POINTER(0); + int nkeys = PG_GETARG_INT32(1); + int norderbys = PG_GETARG_INT32(2); + IndexScanDesc scan; + BloomScanOpaque so; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData)); + initBloomState(&so->state, scan->indexRelation); + so->sign = NULL; /* signature is computed lazily by blgetbitmap() */ + + scan->opaque = so; + + PG_RETURN_POINTER(scan); +} + +/* + * Rescan a bloom index: drop the cached search signature and copy in the new scan keys, if any were given. + */ +Datum +blrescan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + + if (so->sign) + pfree(so->sign); + so->sign = NULL; /* forces blgetbitmap() to rebuild it from the new keys */ + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } + + PG_RETURN_VOID(); +} + +/* + * End scan of bloom index: free the cached search signature, if one was built. + */ +Datum +blendscan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + + if (so->sign) + pfree(so->sign); + so->sign = NULL; + + PG_RETURN_VOID(); +} + +/* + * Insert all matching tuples into a bitmap.
+ */ +Datum +blgetbitmap(PG_FUNCTION_ARGS) +{ + /* We ignore the second argument as we're returning a hash bitmap */ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + Node **bmNodeP = (Node **)PG_GETARG_POINTER(1); + int64 ntids = 0; + BlockNumber blkno = BLOOM_HEAD_BLKNO, + npages; + int i; + BufferAccessStrategy bas; + TIDBitmap *tbm; + BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + + + /* + * GPDB specific code. Since GPDB also support StreamBitmap + * in bitmap index. So normally we need to create specific bitmap + * node in the amgetbitmap AM. + */ + Assert(bmNodeP); + if (*bmNodeP == NULL) + { + /* XXX should we use less than work_mem for this? */ + tbm = tbm_create(work_mem * 1024L); + *bmNodeP = (Node *) tbm; + } + else if (!IsA(*bmNodeP, TIDBitmap)) + elog(ERROR, "non btree bitmap"); + else + tbm = (TIDBitmap *)*bmNodeP; + + if (so->sign == NULL) + { + /* New search: have to calculate search signature */ + ScanKey skey = scan->keyData; + + so->sign = palloc0(sizeof(BloomSignatureWord) * so->state.opts.bloomLength); + + for (i = 0; i < scan->numberOfKeys; i++) + { + /* + * Assume bloom-indexable operators to be strict, so nothing could + * be found for NULL key. + */ + if (skey->sk_flags & SK_ISNULL) + { + pfree(so->sign); + so->sign = NULL; + return 0; + } + + /* Add next value to the signature */ + signValue(&so->state, so->sign, skey->sk_argument, + skey->sk_attno - 1); + + skey++; + } + } + + /* + * We're going to read the whole index. This is why we use appropriate + * buffer access strategy. 
+ */ + bas = GetAccessStrategy(BAS_BULKREAD); + npages = RelationGetNumberOfBlocks(scan->indexRelation); + + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + + buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, + blkno, RBM_NORMAL, bas); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + + /* thats a check for snapshot too old feature + https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=848ef42bb8c7909c9d7baa38178d4a209906e7c1 */ +#if 0 + TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page); +#endif + + if (!PageIsNew(page) && !BloomPageIsDeleted(page)) + { + OffsetNumber offset, + maxOffset = BloomPageGetMaxOffset(page); + + for (offset = 1; offset <= maxOffset; offset++) + { + BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset); + bool res = true; + + /* Check index signature with scan signature */ + for (i = 0; i < so->state.opts.bloomLength; i++) + { + if ((itup->sign[i] & so->sign[i]) != so->sign[i]) + { + res = false; + break; + } + } + + /* Add matching tuples to bitmap */ + if (res) + { + tbm_add_tuples(tbm, &itup->heapPtr, 1, true); + ntids++; + } + } + } + + UnlockReleaseBuffer(buffer); + CHECK_FOR_INTERRUPTS(); + } + FreeAccessStrategy(bas); + + PG_RETURN_INT64(ntids); +} diff --git a/src/backend/access/bloom/blutils.c b/src/backend/access/bloom/blutils.c new file mode 100644 index 00000000000..73d54bf60eb --- /dev/null +++ b/src/backend/access/bloom/blutils.c @@ -0,0 +1,439 @@ +/*------------------------------------------------------------------------- + * + * blutils.c + * Bloom index utilities. 
+ * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1990-1993, Regents of the University of California + * + * IDENTIFICATION + * contrib/bloom/blutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/index.h" +#include "storage/lmgr.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "utils/memutils.h" +#include "access/reloptions.h" +#include "storage/freespace.h" +#include "storage/indexfsm.h" + +#include "bloom.h" + +/* Signature dealing macros - note i is assumed to be of type int */ +#define GETWORD(x,i) ( *( (BloomSignatureWord *)(x) + ( (i) / SIGNWORDBITS ) ) ) +#define CLRBIT(x,i) GETWORD(x,i) &= ~( 0x01 << ( (i) % SIGNWORDBITS ) ) +#define SETBIT(x,i) GETWORD(x,i) |= ( 0x01 << ( (i) % SIGNWORDBITS ) ) +#define GETBIT(x,i) ( (GETWORD(x,i) >> ( (i) % SIGNWORDBITS )) & 0x01 ) + +PG_FUNCTION_INFO_V1(blhandler); + +/* Kind of relation options for bloom index */ +static relopt_kind bl_relopt_kind; + +/* parse table for fillRelOptions */ +static relopt_parse_elt bl_relopt_tab[INDEX_MAX_KEYS + 1]; + +static int32 myRand(void); +static void mySrand(uint32 seed); + +/* + * Module initialize function: initialize info about Bloom relation options. + * + * Note: keep this in sync with makeDefaultBloomOptions(). + */ +void +_PG_init(void) +{ + int i; + char buf[16]; + + bl_relopt_kind = add_reloption_kind(); + + /* Option for length of signature */ + add_int_reloption(bl_relopt_kind, "length", + "Length of signature in bits", + DEFAULT_BLOOM_LENGTH, 1, MAX_BLOOM_LENGTH); + bl_relopt_tab[0].optname = "length"; + bl_relopt_tab[0].opttype = RELOPT_TYPE_INT; + bl_relopt_tab[0].offset = offsetof(BloomOptions, bloomLength); + + /* Number of bits for each possible index column: col1, col2, ... 
*/ + for (i = 0; i < INDEX_MAX_KEYS; i++) + { + snprintf(buf, sizeof(buf), "col%d", i + 1); + add_int_reloption(bl_relopt_kind, buf, + "Number of bits generated for each index column", + DEFAULT_BLOOM_BITS, 1, MAX_BLOOM_BITS); + bl_relopt_tab[i + 1].optname = MemoryContextStrdup(TopMemoryContext, + buf); + bl_relopt_tab[i + 1].opttype = RELOPT_TYPE_INT; + bl_relopt_tab[i + 1].offset = offsetof(BloomOptions, bitSize[i]); + } +} + +/* + * Construct a default set of Bloom options. + */ +static BloomOptions * +makeDefaultBloomOptions(void) +{ + BloomOptions *opts; + int i; + + opts = (BloomOptions *) palloc0(sizeof(BloomOptions)); + /* Convert DEFAULT_BLOOM_LENGTH from # of bits to # of words */ + opts->bloomLength = (DEFAULT_BLOOM_LENGTH + SIGNWORDBITS - 1) / SIGNWORDBITS; + for (i = 0; i < INDEX_MAX_KEYS; i++) + opts->bitSize[i] = DEFAULT_BLOOM_BITS; + SET_VARSIZE(opts, sizeof(BloomOptions)); + return opts; +} + +/* + * Fill BloomState structure for particular index. + */ +void +initBloomState(BloomState *state, Relation index) +{ + int i; + + state->nColumns = index->rd_att->natts; + + /* Initialize hash function for each attribute */ + for (i = 0; i < index->rd_att->natts; i++) + { + fmgr_info_copy(&(state->hashFn[i]), + index_getprocinfo(index, i + 1, BLOOM_HASH_PROC), + CurrentMemoryContext); + } + + /* Initialize amcache if needed with options from metapage */ + if (!index->rd_amcache) + { + Buffer buffer; + Page page; + BloomMetaPageData *meta; + BloomOptions *opts; + + opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions)); + + buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buffer); + + if (!BloomPageIsMeta(page)) + elog(ERROR, "Relation is not a bloom index"); + meta = BloomPageGetMeta(BufferGetPage(buffer)); + + if (meta->magickNumber != BLOOM_MAGICK_NUMBER) + elog(ERROR, "Relation is not a bloom index"); + + *opts = meta->opts; + + UnlockReleaseBuffer(buffer); + + 
index->rd_amcache = (void *) opts; + } + + memcpy(&state->opts, index->rd_amcache, sizeof(state->opts)); + state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ + + sizeof(BloomSignatureWord) * state->opts.bloomLength; +} + +/* + * Random generator copied from FreeBSD. Using own random generator here for + * two reasons: + * + * 1) In this case random numbers are used for on-disk storage. Usage of + * PostgreSQL number generator would obstruct it from all possible changes. + * 2) Changing seed of PostgreSQL random generator would be undesirable side + * effect. + */ +static int32 next; + +static int32 +myRand(void) +{ + /*---------- + * Compute x = (7^5 * x) mod (2^31 - 1) + * without overflowing 31 bits: + * (2^31 - 1) = 127773 * (7^5) + 2836 + * From "Random number generators: good ones are hard to find", + * Park and Miller, Communications of the ACM, vol. 31, no. 10, + * October 1988, p. 1195. + *---------- + */ + int32 hi, + lo, + x; + + /* Must be in [1, 0x7ffffffe] range at this point. */ + hi = next / 127773; + lo = next % 127773; + x = 16807 * lo - 2836 * hi; + if (x < 0) + x += 0x7fffffff; + next = x; + /* Transform to [0, 0x7ffffffd] range. */ + return (x - 1); +} + +static void +mySrand(uint32 seed) +{ + next = seed; + /* Transform to [1, 0x7ffffffe] range. */ + next = (next % 0x7ffffffe) + 1; +} + +/* + * Add bits of given value to the signature. + */ +void +signValue(BloomState *state, BloomSignatureWord *sign, Datum value, int attno) +{ + uint32 hashVal; + int nBit, + j; + + /* + * init generator with "column's" number to get "hashed" seed for new + * value. We don't want to map the same numbers from different columns + * into the same bits! + */ + mySrand(attno); + + /* + * Init hash sequence to map our value into bits. 
the same values in + * different columns will be mapped into different bits because of step + * above + */ + hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value)); + mySrand(hashVal ^ myRand()); + + for (j = 0; j < state->opts.bitSize[attno]; j++) + { + /* prevent multiple evaluation in SETBIT macro */ + nBit = myRand() % (state->opts.bloomLength * SIGNWORDBITS); + SETBIT(sign, nBit); + } +} + +/* + * Make bloom tuple from values. + */ +BloomTuple * +BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull) +{ + int i; + BloomTuple *res = (BloomTuple *) palloc0(state->sizeOfBloomTuple); + + res->heapPtr = *iptr; + + /* Blooming each column */ + for (i = 0; i < state->nColumns; i++) + { + /* skip nulls */ + if (isnull[i]) + continue; + + signValue(state, res->sign, values[i], i); + } + + return res; +} + +/* + * Add new bloom tuple to the page. Returns true if new tuple was successfully + * added to the page. Returns false if it doesn't fit on the page. + */ +bool +BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple) +{ + BloomTuple *itup; + BloomPageOpaque opaque; + Pointer ptr; + + /* We shouldn't be pointed to an invalid page */ + Assert(!PageIsNew(page) && !BloomPageIsDeleted(page)); + + /* Does new tuple fit on the page? 
*/ + if (BloomPageGetFreeSpace(state, page) < state->sizeOfBloomTuple) + return false; + + /* Copy new tuple to the end of page */ + opaque = BloomPageGetOpaque(page); + itup = BloomPageGetTuple(state, page, opaque->maxoff + 1); + memcpy((Pointer) itup, (Pointer) tuple, state->sizeOfBloomTuple); + + /* Adjust maxoff and pd_lower */ + opaque->maxoff++; + ptr = (Pointer) BloomPageGetTuple(state, page, opaque->maxoff + 1); + ((PageHeader) page)->pd_lower = ptr - page; + + /* Assert we didn't overrun available space */ + Assert(((PageHeader) page)->pd_lower <= ((PageHeader) page)->pd_upper); + + return true; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file) + * The returned buffer is already pinned and exclusive-locked + * Caller is responsible for initializing the page by calling BloomInitBuffer + */ +Buffer +BloomNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. + */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + if (PageIsNew(page)) + return buffer; /* OK to use, if never initialized */ + + if (BloomPageIsDeleted(page)) + return buffer; /* OK to use */ + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +/* + * Initialize any page of a bloom index. 
+ */ +void +BloomInitPage(Page page, uint16 flags) +{ + BloomPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData)); + + opaque = BloomPageGetOpaque(page); + memset(opaque, 0, sizeof(BloomPageOpaqueData)); /* also zeroes maxoff: the page starts with no tuples */ + opaque->flags = flags; + opaque->bloom_page_id = BLOOM_PAGE_ID; +} + +/* + * Fill in metapage for bloom index: initialize the page as a meta page and store the magic number plus a copy of the index's options. + */ +void +BloomFillMetapage(Relation index, Page metaPage) +{ + BloomOptions *opts; + BloomMetaPageData *metadata; + + /* + * Choose the index's options. If reloptions have been assigned, use + * those, otherwise create default options. + */ + opts = (BloomOptions *) index->rd_options; + if (!opts) + opts = makeDefaultBloomOptions(); + + /* + * Initialize contents of meta page, including a copy of the options, + * which are now frozen for the life of the index. + */ + BloomInitPage(metaPage, BLOOM_META); + metadata = BloomPageGetMeta(metaPage); + memset(metadata, 0, sizeof(BloomMetaPageData)); /* leaves nStart = nEnd = 0 */ + metadata->magickNumber = BLOOM_MAGICK_NUMBER; + metadata->opts = *opts; + ((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData); + + /* If this fails, probably FreeBlockNumberArray size calc is wrong: */ + Assert(((PageHeader) metaPage)->pd_lower <= ((PageHeader) metaPage)->pd_upper); +} + +/* + * Initialize metapage for bloom index: allocate the first buffer of the index and fill it in as the metapage. + */ +void +BloomInitMetapage(Relation index) +{ + Buffer metaBuffer; + Page metaPage; + /* + * Make a new page; since it is first page it should be associated with + * block number 0 (BLOOM_METAPAGE_BLKNO). + */ + metaBuffer = BloomNewBuffer(index); + Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO); + + /* Initialize contents of meta page */ + metaPage = BufferGetPage(metaBuffer); + BloomFillMetapage(index, metaPage); + + /* NOTE(review): the page is only marked dirty; no WAL record is written in this function -- confirm crash safety is handled elsewhere */ + MarkBufferDirty(metaBuffer); + UnlockReleaseBuffer(metaBuffer); +} + +/* + * Parse reloptions for bloom index, producing a BloomOptions struct.
+ */ +Datum +bloptions(PG_FUNCTION_ARGS) +{ + Datum reloptions = PG_GETARG_DATUM(0); + bool validate = PG_GETARG_BOOL(1); + relopt_value *options; + int numoptions; + BloomOptions *rdopts; + + /* Parse the user-given reloptions (arg 0); arg 1 is passed through as the validate flag */ + options = parseRelOptions(reloptions, validate, bl_relopt_kind, &numoptions); + rdopts = allocateReloptStruct(sizeof(BloomOptions), options, numoptions); + fillRelOptions((void *) rdopts, sizeof(BloomOptions), options, numoptions, + validate, bl_relopt_tab, lengthof(bl_relopt_tab)); + + /* Convert signature length from # of bits to # of words, rounding up */ + rdopts->bloomLength = (rdopts->bloomLength + SIGNWORDBITS - 1) / SIGNWORDBITS; + + /* rdopts was already filled in and dereferenced above, so it is */ + /* never NULL here: always return the parsed options. */ + PG_RETURN_BYTEA_P((bytea *) rdopts); +} diff --git a/src/backend/access/bloom/blvacuum.c b/src/backend/access/bloom/blvacuum.c new file mode 100644 index 00000000000..cea7a426c73 --- /dev/null +++ b/src/backend/access/bloom/blvacuum.c @@ -0,0 +1,226 @@ +/*------------------------------------------------------------------------- + * + * blvacuum.c + * Bloom VACUUM functions. + * + * Copyright (c) 2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/bloom/blvacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "bloom.h" +#include "catalog/storage.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" + + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */ + +Datum +blbulkdelete(PG_FUNCTION_ARGS) +{ + IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); + IndexBulkDeleteResult* volatile result = + (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); + void *callback_state = (void *) PG_GETARG_POINTER(3); + Relation index = info->index; + BlockNumber blkno, + npages; + FreeBlockNumberArray notFullPage; + int countPage = 0; + BloomState state; + Buffer buffer; + Page page; + BloomMetaPageData *metaData; + +#if 0 + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); +#endif + + initBloomState(&state, index); + + /* + * Iterate over the pages. We don't care about concurrently added pages, + * they can't contain tuples to delete. + */ + npages = RelationGetNumberOfBlocks(index); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + BloomTuple *itup, + *itupPtr, + *itupEnd; + + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + + /* Ignore empty/deleted pages until blvacuumcleanup() */ + if (PageIsNew(page) || BloomPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer);; + continue; + } + + /* + * Iterate over the tuples. itup points to current tuple being + * scanned, itupPtr points to where to save next non-deleted tuple. + */ + itup = itupPtr = BloomPageGetTuple(&state, page, FirstOffsetNumber); + itupEnd = BloomPageGetTuple(&state, page, + OffsetNumberNext(BloomPageGetMaxOffset(page))); + while (itup < itupEnd) + { + /* Do we have to delete this tuple? 
*/ + if (callback(&itup->heapPtr, callback_state)) + { + /* Yes; adjust count of tuples that will be left on page */ + BloomPageGetOpaque(page)->maxoff--; +#if 0 + stats->tuples_removed += 1; +#endif + } + else + { + /* No; copy it to itupPtr++, but skip copy if not needed */ + if (itupPtr != itup) + memmove((Pointer) itupPtr, (Pointer) itup, + state.sizeOfBloomTuple); + itupPtr = BloomPageGetNextTuple(&state, itupPtr); + } + + itup = BloomPageGetNextTuple(&state, itup); + } + + /* Assert that we counted correctly */ + Assert(itupPtr == BloomPageGetTuple(&state, page, + OffsetNumberNext(BloomPageGetMaxOffset(page)))); + + /* + * Add page to new notFullPage list if we will not mark page as + * deleted and there is free space on it + */ + if (BloomPageGetMaxOffset(page) != 0 && + BloomPageGetFreeSpace(&state, page) >= state.sizeOfBloomTuple && + countPage < BloomMetaBlockN) + notFullPage[countPage++] = blkno; + + /* Did we delete something? */ + if (itupPtr != itup) + { + /* Is it empty page now? */ + if (BloomPageGetMaxOffset(page) == 0) + BloomPageSetDeleted(page); + /* Adjust pg_lower */ + ((PageHeader) page)->pd_lower = (Pointer) itupPtr - page; + /* Finish WAL-logging */ + } + else + { + /* Didn't change anything: abort WAL-logging */ + } + + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + + /* + * Update the metapage's notFullPage list with whatever we found. Our + * info could already be out of date at this point, but blinsert() will + * cope if so. + */ + buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buffer); + + metaData = BloomPageGetMeta(page); + memcpy(metaData->notFullPage, notFullPage, sizeof(BlockNumber) * countPage); + metaData->nStart = 0; + metaData->nEnd = countPage; + + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); + + PG_RETURN_POINTER(result); +} + +/* + * Post-VACUUM cleanup. 
+ * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +Datum +blvacuumcleanup(PG_FUNCTION_ARGS) +{ + IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); + IndexBulkDeleteResult *stats = + (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + Relation index = info->index; + BlockNumber npages, + blkno; + + if (info->analyze_only) + PG_RETURN_POINTER(stats); + + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + /* + * Iterate over the pages: insert deleted pages into FSM and collect + * statistics. + */ + npages = RelationGetNumberOfBlocks(index); + stats->num_pages = npages; + stats->pages_free = 0; + stats->num_index_tuples = 0; + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, info->strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || BloomPageIsDeleted(page)) + { + RecordFreeIndexPage(index, blkno); + stats->pages_free++; + } + else + { + stats->num_index_tuples += BloomPageGetMaxOffset(page); + } + + UnlockReleaseBuffer(buffer); + } + + IndexFreeSpaceMapVacuum(info->index); + + PG_RETURN_POINTER(stats); +} diff --git a/src/backend/access/bloom/expected/bloom.out b/src/backend/access/bloom/expected/bloom.out new file mode 100644 index 00000000000..77521f02869 --- /dev/null +++ b/src/backend/access/bloom/expected/bloom.out @@ -0,0 +1,213 @@ +CREATE EXTENSION bloom; +CREATE TABLE tst ( + i int4, + t text +); +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); +ALTER INDEX bloomidx SET (length=80); +SET enable_seqscan=on; +SET enable_bitmapscan=off; +SET enable_indexscan=off; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM 
tst WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 13 +(1 row) + +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; + QUERY PLAN +------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: (i = 7) + -> Bitmap Index Scan on bloomidx + Index Cond: (i = 7) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; + QUERY PLAN +------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: (t = '5'::text) + -> Bitmap Index Scan on bloomidx + Index Cond: (t = '5'::text) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tst + Recheck Cond: ((i = 7) AND (t = '5'::text)) + -> Bitmap Index Scan on bloomidx + Index Cond: ((i = 7) AND (t = '5'::text)) +(5 rows) + +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 13 +(1 row) + +DELETE FROM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +VACUUM ANALYZE tst; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 13 +(1 row) + +DELETE FROM tst WHERE i > 1 OR t = '5'; +VACUUM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count 
+------- + 13 +(1 row) + +VACUUM FULL tst; +SELECT count(*) FROM tst WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM tst WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + count +------- + 13 +(1 row) + +-- Try an unlogged table too +CREATE UNLOGGED TABLE tstu ( + i int4, + t text +); +INSERT INTO tstu SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +CREATE INDEX bloomidxu ON tstu USING bloom (i, t) WITH (col2 = 4); +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE i = 7; + QUERY PLAN +-------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstu + Recheck Cond: (i = 7) + -> Bitmap Index Scan on bloomidxu + Index Cond: (i = 7) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE t = '5'; + QUERY PLAN +-------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstu + Recheck Cond: (t = '5'::text) + -> Bitmap Index Scan on bloomidxu + Index Cond: (t = '5'::text) +(5 rows) + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE i = 7 AND t = '5'; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tstu + Recheck Cond: ((i = 7) AND (t = '5'::text)) + -> Bitmap Index Scan on bloomidxu + Index Cond: ((i = 7) AND (t = '5'::text)) +(5 rows) + +SELECT count(*) FROM tstu WHERE i = 7; + count +------- + 200 +(1 row) + +SELECT count(*) FROM tstu WHERE t = '5'; + count +------- + 112 +(1 row) + +SELECT count(*) FROM tstu WHERE i = 7 AND t = '5'; + count +------- + 13 +(1 row) + +RESET enable_seqscan; +RESET enable_bitmapscan; +RESET enable_indexscan; +-- Run amvalidator function on our opclasses +SELECT opcname, amvalidate(opc.oid) +FROM pg_opclass opc JOIN pg_am am ON am.oid = opcmethod +WHERE amname = 'bloom' +ORDER BY 1; + opcname | amvalidate +----------+------------ + int4_ops | t + 
text_ops | t +(2 rows) + diff --git a/src/backend/access/bloom/sql/bloom.sql b/src/backend/access/bloom/sql/bloom.sql new file mode 100644 index 00000000000..b2136c3a798 --- /dev/null +++ b/src/backend/access/bloom/sql/bloom.sql @@ -0,0 +1,84 @@ +CREATE EXTENSION bloom; + +CREATE TABLE tst ( + i int4, + t text +); + +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); +ALTER INDEX bloomidx SET (length=80); + +SET enable_seqscan=on; +SET enable_bitmapscan=off; +SET enable_indexscan=off; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +DELETE FROM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; +VACUUM ANALYZE tst; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +DELETE FROM tst WHERE i > 1 OR t = '5'; +VACUUM tst; +INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,2000) i; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +VACUUM FULL tst; + +SELECT count(*) FROM tst WHERE i = 7; +SELECT count(*) FROM tst WHERE t = '5'; +SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; + +-- Try an unlogged table too + +CREATE UNLOGGED TABLE tstu ( + i int4, + t text +); + +INSERT INTO tstu SELECT i%10, substr(md5(i::text), 1, 1) FROM 
generate_series(1,2000) i; +CREATE INDEX bloomidxu ON tstu USING bloom (i, t) WITH (col2 = 4); + +SET enable_seqscan=off; +SET enable_bitmapscan=on; +SET enable_indexscan=on; + +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE i = 7; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE t = '5'; +EXPLAIN (COSTS OFF) SELECT count(*) FROM tstu WHERE i = 7 AND t = '5'; + +SELECT count(*) FROM tstu WHERE i = 7; +SELECT count(*) FROM tstu WHERE t = '5'; +SELECT count(*) FROM tstu WHERE i = 7 AND t = '5'; + +RESET enable_seqscan; +RESET enable_bitmapscan; +RESET enable_indexscan; + +-- Run amvalidator function on our opclasses +SELECT opcname, amvalidate(opc.oid) +FROM pg_opclass opc JOIN pg_am am ON am.oid = opcmethod +WHERE amname = 'bloom' +ORDER BY 1; diff --git a/src/backend/access/bloom/t/001_wal.pl b/src/backend/access/bloom/t/001_wal.pl new file mode 100644 index 00000000000..79615228d2d --- /dev/null +++ b/src/backend/access/bloom/t/001_wal.pl @@ -0,0 +1,80 @@ +# Test generic xlog record work for bloom index replication. +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 31; + +my $node_master; +my $node_standby; + +# Run few queries on both master and standby and check their results match. 
+sub test_index_replay
+{
+	my ($test_name) = @_;	# label used as prefix of the test description below
+
+	# Wait for standby to catch up: poll master until its current xlog location is <= the write_location reported for this standby
+	my $applname = $node_standby->name;
+	my $caughtup_query =
+"SELECT pg_current_xlog_location() <= write_location FROM pg_stat_replication WHERE application_name = '$applname';";
+	$node_master->poll_query_until('postgres', $caughtup_query)
+	  or die "Timed out while waiting for standby 1 to catch up";
+
+	my $queries = qq(SET enable_seqscan=off;
+SET enable_bitmapscan=on;
+SET enable_indexscan=on;
+SELECT * FROM tst WHERE i = 0;
+SELECT * FROM tst WHERE i = 3;
+SELECT * FROM tst WHERE t = 'b';
+SELECT * FROM tst WHERE t = 'f';
+SELECT * FROM tst WHERE i = 3 AND t = 'c';
+SELECT * FROM tst WHERE i = 7 AND t = 'e';
+);
+
+	# Run the same query batch on both nodes and compare the two outputs as whole strings
+	my $master_result = $node_master->safe_psql("postgres", $queries);
+	my $standby_result = $node_standby->safe_psql("postgres", $queries);
+
+	is($master_result, $standby_result, "$test_name: query result matches");	# exactly one test result per call
+}
+
+# Initialize master node
+$node_master = get_new_node('master');
+$node_master->init(allows_streaming => 1);
+$node_master->start;
+my $backup_name = 'my_backup';	# same name is used to take the backup and to build the standby from it
+
+# Take backup
+$node_master->backup($backup_name);
+
+# Create streaming standby linking to master
+$node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, $backup_name,
+	has_streaming => 1);
+$node_standby->start;
+
+# Create some bloom index on master
+$node_master->safe_psql("postgres", "CREATE EXTENSION bloom;");
+$node_master->safe_psql("postgres", "CREATE TABLE tst (i int4, t text);");
+$node_master->safe_psql("postgres",
+"INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;"
+);
+$node_master->safe_psql("postgres",
+	"CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);");
+
+# Test that queries give same result
+test_index_replay('initial');
+
+# Run 10 cycles of table modification. Run test queries after each modification.
+for my $i (1 ..
10) +{ + $node_master->safe_psql("postgres", "DELETE FROM tst WHERE i = $i;"); + test_index_replay("delete $i"); + $node_master->safe_psql("postgres", "VACUUM tst;"); + test_index_replay("vacuum $i"); + my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000); + $node_master->safe_psql("postgres", +"INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series($start,$end) i;" + ); + test_index_replay("insert $i"); +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index d421ce8b0f3..8e85558883c 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7815,3 +7815,36 @@ bmcostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + + +/* + * Estimate cost of bloom index scan. + */ +Datum +blcostestimate(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + IndexPath *path = (IndexPath *) PG_GETARG_POINTER(1); + double loop_count = PG_GETARG_FLOAT8(2); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(3); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(4); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(5); + double *indexCorrelation = (double *) PG_GETARG_POINTER(6); + IndexOptInfo *index = path->indexinfo; + GenericCosts costs; + + MemSet(&costs, 0, sizeof(costs)); + + /* We have to visit all index tuples anyway */ + costs.numIndexTuples = index->tuples; + + /* Use generic estimate */ + genericcostestimate(root, path, loop_count, &costs); + + *indexStartupCost = costs.indexStartupCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index c53a7858e16..07540c5469f 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1399,9 +1399,9 @@ LookupOpclassInfo(Oid operatorClassOid, * this happens only 
once per opclass per backend. */ #if defined(CLOBBER_CACHE_ALWAYS) - opcentry->valid = false; #endif + opcentry->valid = false; if (opcentry->valid) return opcentry; diff --git a/src/include/catalog/pg_proc_gp.h b/src/include/catalog/pg_proc_gp.h index ac59442548a..85c3ea93084 100644 --- a/src/include/catalog/pg_proc_gp.h +++ b/src/include/catalog/pg_proc_gp.h @@ -374,7 +374,7 @@ DESCR("bitmap(internal)"); /* bmoptions(_text, bool) => bytea */ DATA(insert OID = 7197 ( bmoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ bmoptions _null_ _null_ _null_ n a )); -DESCR("btree(internal)"); +DESCR("bitmap(internal)"); /* AOCS functions. */ diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 98eb27338ff..26da5b35991 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -203,6 +203,7 @@ extern Datum gistcostestimate(PG_FUNCTION_ARGS); extern Datum spgcostestimate(PG_FUNCTION_ARGS); extern Datum gincostestimate(PG_FUNCTION_ARGS); extern Datum bmcostestimate(PG_FUNCTION_ARGS); +extern Datum blcostestimate(PG_FUNCTION_ARGS); /* Functions in array_selfuncs.c */