StringInfo
rest_call_with_lock(char *method, char *url, char *params, StringInfo postData, int64 mutex, bool shared, bool allowCancel)
{
	CURL	   *curl;
	struct curl_slist *headers = NULL;
	char	   *errorbuff;
	StringInfo	response = makeStringInfo();
	CURLcode	ret;
	long		response_code;	/* CURLINFO_RESPONSE_CODE expects a long */

	errorbuff = (char *) palloc0(CURL_ERROR_SIZE);
	curl = curl_easy_init();

	if (curl)
	{
		headers = curl_slist_append(headers, "Transfer-Encoding:");
		headers = curl_slist_append(headers, "Expect:");
		curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);

		curl_easy_setopt(curl, CURLOPT_FORBID_REUSE, 0L);	/* allow connections to be reused */
		if (allowCancel)
		{
			curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);		/* we want progress ... */
			curl_easy_setopt(curl, CURLOPT_PROGRESSFUNCTION, curl_progress_func);	/* ... to go here so we can detect a ^C within postgres */
		}
		curl_easy_setopt(curl, CURLOPT_USERAGENT, "zombodb for PostgreSQL");
		curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 0L);
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write_func);
		curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0L);
		curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errorbuff);
		curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
		curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60 * 60L);	/* timeout of 60 minutes */

		curl_easy_setopt(curl, CURLOPT_URL, url);
		curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method);
		curl_easy_setopt(curl, CURLOPT_WRITEDATA, response);
		curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long) (postData ? postData->len : 0));
		curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postData ? postData->data : NULL);
		curl_easy_setopt(curl, CURLOPT_POST,
						 (long) ((strcmp(method, "POST") == 0 ||
								  (strcmp(method, "GET") != 0 && postData && postData->data)) ? 1 : 0));
	}
	else
	{
		elog(IsTransactionState() ? ERROR : WARNING, "Unable to initialize libcurl");

		/* without a curl handle there is nothing more we can do */
		pfree(errorbuff);
		return response;
	}

//	if (mutex != 0)
//	{
//		if (shared) DirectFunctionCall1(pg_advisory_lock_shared_int8, Int64GetDatum(mutex));
//		else DirectFunctionCall1(pg_advisory_lock_int8, Int64GetDatum(mutex));
//	}
	ret = curl_easy_perform(curl);
//	if (mutex != 0)
//	{
//		if (shared) DirectFunctionCall1(pg_advisory_unlock_shared_int8, Int64GetDatum(mutex));
//		else DirectFunctionCall1(pg_advisory_unlock_int8, Int64GetDatum(mutex));
//	}

	if (allowCancel && IsTransactionState() && InterruptPending)
	{
		/* we might have detected one in the progress function, so check for sure */
		CHECK_FOR_INTERRUPTS();
	}

	if (ret != 0)
	{
		/* curl messed up */
		elog(IsTransactionState() ? ERROR : WARNING,
			 "libcurl error-code: %s(%d); message: %s; req=-X%s %s",
			 curl_easy_strerror(ret), ret, errorbuff, method, url);
	}

	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
	if (response_code < 200 || (response_code >= 300 && response_code != 404))
	{
		text	   *errorText = DatumGetTextP(DirectFunctionCall2(json_object_field_text,
																  CStringGetTextDatum(response->data),
																  CStringGetTextDatum("error")));

		elog(IsTransactionState() ? ERROR : WARNING, "rc=%ld; %s", response_code,
			 errorText != NULL ? TextDatumGetCString(errorText) : response->data);
	}

	if (headers)
		curl_slist_free_all(headers);
	curl_easy_cleanup(curl);
	pfree(errorbuff);

	return response;
}
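/*
 * The two libcurl callbacks referenced above (curl_write_func and
 * curl_progress_func) are not shown in this excerpt.  The sketch below is a
 * minimal illustration of what they need to do, assuming the standard
 * libcurl callback signatures; the actual zombodb implementations may differ
 * in detail.
 */
#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"
#include <curl/curl.h>

/* CURLOPT_WRITEFUNCTION: append each received chunk to the StringInfo passed via CURLOPT_WRITEDATA */
static size_t
curl_write_func(char *ptr, size_t size, size_t nmemb, void *userdata)
{
	StringInfo	response = (StringInfo) userdata;

	appendBinaryStringInfo(response, ptr, (int) (size * nmemb));
	return size * nmemb;		/* tell curl we consumed everything */
}

/* CURLOPT_PROGRESSFUNCTION: abort the transfer if PostgreSQL has a pending interrupt (^C) */
static int
curl_progress_func(void *clientp, double dltotal, double dlnow,
				   double ultotal, double ulnow)
{
	/* returning non-zero makes curl_easy_perform() fail with CURLE_ABORTED_BY_CALLBACK */
	return InterruptPending ? 1 : 0;
}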
/*
 * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method.
 */
TableSampleDesc *
tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample)
{
	FunctionCallInfoData fcinfo;
	int			i;
	List	   *args = tablesample->args;
	ListCell   *arg;
	ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
	TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc));

	/* Load functions */
	fmgr_info(tablesample->tsminit, &(tsdesc->tsminit));
	fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock));
	fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple));
	if (OidIsValid(tablesample->tsmexaminetuple))
		fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple));
	else
		tsdesc->tsmexaminetuple.fn_oid = InvalidOid;
	fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset));
	fmgr_info(tablesample->tsmend, &(tsdesc->tsmend));

	InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit,
							 list_length(args) + 2,
							 InvalidOid, NULL, NULL);

	tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
	tsdesc->heapScan = scanstate->ss.ss_currentScanDesc;

	/* First argument for init function is always TableSampleDesc */
	fcinfo.arg[0] = PointerGetDatum(tsdesc);
	fcinfo.argnull[0] = false;

	/*
	 * Second arg for init function is always REPEATABLE.
	 *
	 * When tablesample->repeatable is NULL, the REPEATABLE clause was not
	 * specified.  When specified, the expression cannot evaluate to NULL.
	 */
	if (tablesample->repeatable)
	{
		ExprState  *argstate = ExecInitExpr((Expr *) tablesample->repeatable,
											(PlanState *) scanstate);

		fcinfo.arg[1] = ExecEvalExpr(argstate, econtext,
									 &fcinfo.argnull[1], NULL);
		if (fcinfo.argnull[1])
			ereport(ERROR,
					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
					 errmsg("REPEATABLE clause must be NOT NULL numeric value")));
	}
	else
	{
		fcinfo.arg[1] = UInt32GetDatum(random());
		fcinfo.argnull[1] = false;
	}

	/* Rest of the arguments come from user. */
	i = 2;
	foreach(arg, args)
	{
		Expr	   *argexpr = (Expr *) lfirst(arg);
		ExprState  *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate);

		if (argstate == NULL)
		{
			/* nothing to evaluate for this argument; pass a NULL */
			fcinfo.argnull[i] = true;
			fcinfo.arg[i] = (Datum) 0;
			i++;
			continue;
		}

		fcinfo.arg[i] = ExecEvalExpr(argstate, econtext,
									 &fcinfo.argnull[i], NULL);
		i++;
	}
/* ---------------------------------------------------------------- * ExecInitGather * ---------------------------------------------------------------- */ GatherMergeState * ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags) { GatherMergeState *gm_state; Plan *outerNode; TupleDesc tupDesc; /* Gather merge node doesn't have innerPlan node. */ Assert(innerPlan(node) == NULL); /* * create state structure */ gm_state = makeNode(GatherMergeState); gm_state->ps.plan = (Plan *) node; gm_state->ps.state = estate; gm_state->ps.ExecProcNode = ExecGatherMerge; gm_state->initialized = false; gm_state->gm_initialized = false; gm_state->tuples_needed = -1; /* * Miscellaneous initialization * * create expression context for node */ ExecAssignExprContext(estate, &gm_state->ps); /* * GatherMerge doesn't support checking a qual (it's always more efficient * to do it in the child node). */ Assert(!node->plan.qual); /* * now initialize outer plan */ outerNode = outerPlan(node); outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags); /* * Leader may access ExecProcNode result directly (if * need_to_scan_locally), or from workers via tuple queue. So we can't * trivially rely on the slot type being fixed for expressions evaluated * within this node. */ gm_state->ps.outeropsset = true; gm_state->ps.outeropsfixed = false; /* * Store the tuple descriptor into gather merge state, so we can use it * while initializing the gather merge slots. */ tupDesc = ExecGetResultType(outerPlanState(gm_state)); gm_state->tupDesc = tupDesc; /* * Initialize result type and projection. */ ExecInitResultTypeTL(&gm_state->ps); ExecConditionalAssignProjectionInfo(&gm_state->ps, tupDesc, OUTER_VAR); /* * Without projections result slot type is not trivially known, see * comment above. */ if (gm_state->ps.ps_ProjInfo == NULL) { gm_state->ps.resultopsset = true; gm_state->ps.resultopsfixed = false; } /* * initialize sort-key information */ if (node->numCols) { int i; gm_state->gm_nkeys = node->numCols; gm_state->gm_sortkeys = palloc0(sizeof(SortSupportData) * node->numCols); for (i = 0; i < node->numCols; i++) { SortSupport sortKey = gm_state->gm_sortkeys + i; sortKey->ssup_cxt = CurrentMemoryContext; sortKey->ssup_collation = node->collations[i]; sortKey->ssup_nulls_first = node->nullsFirst[i]; sortKey->ssup_attno = node->sortColIdx[i]; /* * We don't perform abbreviated key conversion here, for the same * reasons that it isn't used in MergeAppend */ sortKey->abbreviate = false; PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); } } /* Now allocate the workspace for gather merge */ gather_merge_setup(gm_state); return gm_state; }
Datum lquery_in(PG_FUNCTION_ARGS) { char *buf = (char *) PG_GETARG_POINTER(0); char *ptr; int num = 0, totallen = 0, numOR = 0; int state = LQPRS_WAITLEVEL; lquery *result; nodeitem *lptr = NULL; lquery_level *cur, *curqlevel, *tmpql; lquery_variant *lrptr = NULL; bool hasnot = false; bool wasbad = false; int charlen; int pos = 0; ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (charlen == 1) { if (t_iseq(ptr, '.')) num++; else if (t_iseq(ptr, '|')) numOR++; } ptr += charlen; } num++; if (num > MaxAllocSize / ITEMSIZE) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("number of levels (%d) exceeds the maximum allowed (%d)", num, (int) (MaxAllocSize / ITEMSIZE)))); curqlevel = tmpql = (lquery_level *) palloc0(ITEMSIZE * num); ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (state == LQPRS_WAITLEVEL) { if (ISALNUM(ptr)) { GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); lptr->start = ptr; state = LQPRS_WAITDELIM; curqlevel->numvar = 1; } else if (charlen == 1 && t_iseq(ptr, '!')) { GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); lptr->start = ptr + 1; state = LQPRS_WAITDELIM; curqlevel->numvar = 1; curqlevel->flag |= LQL_NOT; hasnot = true; } else if (charlen == 1 && t_iseq(ptr, '*')) state = LQPRS_WAITOPEN; else UNCHAR; } else if (state == LQPRS_WAITVAR) { if (ISALNUM(ptr)) { lptr++; lptr->start = ptr; state = LQPRS_WAITDELIM; curqlevel->numvar++; } else UNCHAR; } else if (state == LQPRS_WAITDELIM) { if (charlen == 1 && t_iseq(ptr, '@')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_INCASE; curqlevel->flag |= LVAR_INCASE; } else if (charlen == 1 && t_iseq(ptr, '*')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_ANYEND; curqlevel->flag |= LVAR_ANYEND; } else if (charlen == 1 && t_iseq(ptr, '%')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_SUBLEXEME; curqlevel->flag |= LVAR_SUBLEXEME; } else if (charlen == 1 && t_iseq(ptr, '|')) { lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 1 : 0); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); state = LQPRS_WAITVAR; } else if (charlen == 1 && t_iseq(ptr, '.')) { lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 
1 : 0); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); state = LQPRS_WAITLEVEL; curqlevel = NEXTLEV(curqlevel); } else if (ISALNUM(ptr)) { if (lptr->flag) UNCHAR; } else UNCHAR; } else if (state == LQPRS_WAITOPEN) { if (charlen == 1 && t_iseq(ptr, '{')) state = LQPRS_WAITFNUM; else if (charlen == 1 && t_iseq(ptr, '.')) { curqlevel->low = 0; curqlevel->high = 0xffff; curqlevel = NEXTLEV(curqlevel); state = LQPRS_WAITLEVEL; } else UNCHAR; } else if (state == LQPRS_WAITFNUM) { if (charlen == 1 && t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; else if (t_isdigit(ptr)) { curqlevel->low = atoi(ptr); state = LQPRS_WAITND; } else UNCHAR; } else if (state == LQPRS_WAITSNUM) { if (t_isdigit(ptr)) { curqlevel->high = atoi(ptr); state = LQPRS_WAITCLOSE; } else if (charlen == 1 && t_iseq(ptr, '}')) { curqlevel->high = 0xffff; state = LQPRS_WAITEND; } else UNCHAR; } else if (state == LQPRS_WAITCLOSE) { if (charlen == 1 && t_iseq(ptr, '}')) state = LQPRS_WAITEND; else if (!t_isdigit(ptr)) UNCHAR; } else if (state == LQPRS_WAITND) { if (charlen == 1 && t_iseq(ptr, '}')) { curqlevel->high = curqlevel->low; state = LQPRS_WAITEND; } else if (charlen == 1 && t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; else if (!t_isdigit(ptr)) UNCHAR; } else if (state == LQPRS_WAITEND) { if (charlen == 1 && t_iseq(ptr, '.')) { state = LQPRS_WAITLEVEL; curqlevel = NEXTLEV(curqlevel); } else UNCHAR; } else /* internal error */ elog(ERROR, "internal error in parser"); ptr += charlen; if (state == LQPRS_WAITDELIM) lptr->wlen++; pos++; } if (state == LQPRS_WAITDELIM) { if (lptr->start == ptr) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 
1 : 0); if (lptr->len == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); } else if (state == LQPRS_WAITOPEN) curqlevel->high = 0xffff; else if (state != LQPRS_WAITEND) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); curqlevel = tmpql; totallen = LQUERY_HDRSIZE; while ((char *) curqlevel - (char *) tmpql < num * ITEMSIZE) { totallen += LQL_HDRSIZE; if (curqlevel->numvar) { lptr = GETVAR(curqlevel); while (lptr - GETVAR(curqlevel) < curqlevel->numvar) { totallen += MAXALIGN(LVAR_HDRSIZE + lptr->len); lptr++; } } else if (curqlevel->low > curqlevel->high) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Low limit(%d) is greater than upper(%d).", curqlevel->low, curqlevel->high))); curqlevel = NEXTLEV(curqlevel); } result = (lquery *) palloc0(totallen); SET_VARSIZE(result, totallen); result->numlevel = num; result->firstgood = 0; result->flag = 0; if (hasnot) result->flag |= LQUERY_HASNOT; cur = LQUERY_FIRST(result); curqlevel = tmpql; while ((char *) curqlevel - (char *) tmpql < num * ITEMSIZE) { memcpy(cur, curqlevel, LQL_HDRSIZE); cur->totallen = LQL_HDRSIZE; if (curqlevel->numvar) { lrptr = LQL_FIRST(cur); lptr = GETVAR(curqlevel); while (lptr - GETVAR(curqlevel) < curqlevel->numvar) { cur->totallen += MAXALIGN(LVAR_HDRSIZE + lptr->len); lrptr->len = lptr->len; lrptr->flag = lptr->flag; lrptr->val = ltree_crc32_sz(lptr->start, lptr->len); memcpy(lrptr->name, lptr->start, lptr->len); lptr++; lrptr = LVAR_NEXT(lrptr); } pfree(GETVAR(curqlevel)); if (cur->numvar > 1 || cur->flag != 0) wasbad = true; else if (wasbad == false) (result->firstgood)++; } else wasbad = true; curqlevel = NEXTLEV(curqlevel); cur = LQL_NEXT(cur); } pfree(tmpql); PG_RETURN_POINTER(result); }
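/*
 * For reference, a small illustration (not part of the original source) of
 * the kind of patterns the parser above accepts.  The sample pattern comes
 * from the ltree documentation; the call pattern assumes backend code where
 * DirectFunctionCall1 and the contrib ltree.h header are available.
 */
#include "postgres.h"
#include "fmgr.h"
#include "ltree.h"

static lquery *
parse_sample_lquery(void)
{
	/*
	 * "Top.*{0,2}.sport*@.!football|tennis" means: start at Top, then at
	 * most two arbitrary levels, then a label with the case-insensitive
	 * prefix "sport", then a label that is neither football nor tennis.
	 */
	Datum		result = DirectFunctionCall1(lquery_in,
											 CStringGetDatum("Top.*{0,2}.sport*@.!football|tennis"));

	return (lquery *) DatumGetPointer(result);
}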
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * fdwroutine if it's a foreign table, the FDW function pointers * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary or unlogged relations during recovery"))); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->allvisfrac); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemRelation(relation))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. 
Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes, if they're marked * IndexIsReady. */ if (!IndexIsValid(index)) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->indexcollations[i] = indexRelation->rd_indcollation[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->canreturn = index_can_return(indexRelation); info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearcharray = indexRelation->rd_am->amsearcharray; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering information for the index, if any. */ if (info->relam == BTREE_AM_OID) { /* * If it's a btree index, we can use its opfamily OIDs * directly as the sort ordering opfamily OIDs. */ Assert(indexRelation->rd_am->amcanorder); info->sortopfamily = info->opfamily; info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } else if (indexRelation->rd_am->amcanorder) { /* * Otherwise, identify the corresponding btree opfamilies by * trying to map this index's "<" operators into btree. Since * "<" uniquely defines the behavior of a sort order, this is * a sufficient test. * * XXX This method is rather slow and also requires the * undesirable assumption that the other index AM numbers its * strategies the same as btree. It'd be better to have a way * to explicitly declare the corresponding btree opfamily for * each opfamily of the other index type. But given the lack * of current or foreseeable amcanorder index types, it's not * worth expending more effort on now. 
*/ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; Oid ltopr; Oid btopfamily; Oid btopcintype; int16 btstrategy; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; ltopr = get_opfamily_member(info->opfamily[i], info->opcintype[i], info->opcintype[i], BTLessStrategyNumber); if (OidIsValid(ltopr) && get_ordering_op_properties(ltopr, &btopfamily, &btopcintype, &btstrategy) && btopcintype == info->opcintype[i] && btstrategy == BTLessStrategyNumber) { /* Successful mapping */ info->sortopfamily[i] = btopfamily; } else { /* Fail ... quietly treat index as unordered */ info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; break; } } } else { info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; info->immediate = index->indimmediate; info->hypothetical = false; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { double allvisfrac; /* dummy */ estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples, &allvisfrac); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } if (info->relam == BTREE_AM_OID) { /* For btrees, get tree height while we have the index open */ info->tree_height = _bt_getrootheight(indexRelation); } else { /* For other index types, just set it to "unknown" for now */ info->tree_height = -1; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * Construct an inner tuple containing the given prefix and node array */ SpGistInnerTuple spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix, int nNodes, SpGistNodeTuple *nodes) { SpGistInnerTuple tup; unsigned int size; unsigned int prefixSize; int i; char *ptr; /* Compute size needed */ if (hasPrefix) prefixSize = SpGistGetTypeSize(&state->attPrefixType, prefix); else prefixSize = 0; size = SGITHDRSZ + prefixSize; /* Note: we rely on node tuple sizes to be maxaligned already */ for (i = 0; i < nNodes; i++) size += IndexTupleSize(nodes[i]); /* * Ensure that we can replace the tuple with a dead tuple later. This * test is unnecessary given current tuple layouts, but let's be safe. */ if (size < SGDTSIZE) size = SGDTSIZE; /* * Inner tuple should be small enough to fit on a page */ if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("SP-GiST inner tuple size %zu exceeds maximum %zu", (Size) size, SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)), errhint("Values larger than a buffer page cannot be indexed."))); /* * Check for overflow of header fields --- probably can't fail if the * above succeeded, but let's be paranoid */ if (size > SGITMAXSIZE || prefixSize > SGITMAXPREFIXSIZE || nNodes > SGITMAXNNODES) elog(ERROR, "SPGiST inner tuple header field is too small"); /* OK, form the tuple */ tup = (SpGistInnerTuple) palloc0(size); tup->nNodes = nNodes; tup->prefixSize = prefixSize; tup->size = size; if (hasPrefix) memcpyDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix); ptr = (char *) SGITNODEPTR(tup); for (i = 0; i < nNodes; i++) { SpGistNodeTuple node = nodes[i]; memcpy(ptr, node, IndexTupleSize(node)); ptr += IndexTupleSize(node); } return tup; }
static Datum gp_aovisimap_entry_internal(PG_FUNCTION_ARGS, Oid aoRelOid) { Datum values[4]; bool nulls[4]; HeapTuple tuple; Datum result; typedef struct Context { AppendOnlyVisimap visiMap; Relation parentRelation; IndexScanDesc indexScan; text *bitmapBuffer; } Context; FuncCallContext *funcctx; Context *context; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function * calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ tupdesc = CreateTemplateTupleDesc(4, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segno", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "first_row_num", INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "hidden_tupcount", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "bitmap", TEXTOID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ context = (Context *) palloc0(sizeof(Context)); context->parentRelation = heap_open(aoRelOid, AccessShareLock); if (!(RelationIsAoRows(context->parentRelation) || RelationIsAoCols(context->parentRelation))) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Function not supported on relation"))); } AppendOnlyVisimap_Init(&context->visiMap, context->parentRelation->rd_appendonly->visimaprelid, context->parentRelation->rd_appendonly->visimapidxid, AccessShareLock, SnapshotNow); context->indexScan = AppendOnlyVisimapStore_BeginScan(& context->visiMap.visimapStore, 0, NULL); context->bitmapBuffer = palloc0(VARHDRSZ + APPENDONLY_VISIMAP_MAX_RANGE + 1); funcctx->user_fctx = (void *) context; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); context = (Context *) funcctx->user_fctx; if (AppendOnlyVisimapStore_GetNext(&context->visiMap.visimapStore, context->indexScan, ForwardScanDirection, &context->visiMap.visimapEntry, NULL)) { AppendOnlyVisimapEntry *visimapEntry = &context->visiMap.visimapEntry; MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); values[0] = Int32GetDatum(visimapEntry->segmentFileNum); values[1] = Int64GetDatum(visimapEntry->firstRowNum); values[2] = Int32GetDatum( (int32)AppendOnlyVisimapEntry_GetHiddenTupleCount(visimapEntry)); gp_aovisimap_encode_bitmap(VARDATA(context->bitmapBuffer), visimapEntry->bitmap); SET_VARSIZE(context->bitmapBuffer, APPENDONLY_VISIMAP_MAX_RANGE); values[3] = PointerGetDatum(context->bitmapBuffer); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } AppendOnlyVisimapStore_EndScan(&context->visiMap.visimapStore, context->indexScan); AppendOnlyVisimap_Finish(&context->visiMap, AccessShareLock); heap_close(context->parentRelation, AccessShareLock); pfree(context->bitmapBuffer); pfree(context); funcctx->user_fctx = NULL; SRF_RETURN_DONE(funcctx); }
static Datum gp_aovisimap_hidden_info_internal(PG_FUNCTION_ARGS, Oid aoRelOid) { Datum values[3]; bool nulls[3]; HeapTuple tuple; Datum result; typedef struct Context { AppendOnlyVisimap visiMap; Relation parentRelation; FileSegInfo **appendonlySegfileInfo; AOCSFileSegInfo **aocsSegfileInfo; int segfile_info_total; int i; } Context; FuncCallContext *funcctx; Context *context; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function * calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segno", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "hidden_tupcount", INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "total_tupcount", INT8OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ context = (Context *) palloc0(sizeof(Context)); context->parentRelation = heap_open(aoRelOid, AccessShareLock); if (!(RelationIsAoRows(context->parentRelation) || RelationIsAoCols(context->parentRelation))) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Function not supported on relation"))); } if (RelationIsAoRows(context->parentRelation)) { context->appendonlySegfileInfo = GetAllFileSegInfo( context->parentRelation, SnapshotNow, &context->segfile_info_total); } else { Assert(RelationIsAoCols(context->parentRelation)); context->aocsSegfileInfo = GetAllAOCSFileSegInfo(context->parentRelation, SnapshotNow, &context->segfile_info_total); } context->i = 0; AppendOnlyVisimap_Init(&context->visiMap, context->parentRelation->rd_appendonly->visimaprelid, context->parentRelation->rd_appendonly->visimapidxid, AccessShareLock, SnapshotNow); funcctx->user_fctx = (void *) context; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); context = (Context *) funcctx->user_fctx; while (context->i < context->segfile_info_total) { int64 tupcount; int segno; if (context->appendonlySegfileInfo) { FileSegInfo *fsinfo = context->appendonlySegfileInfo[context->i]; tupcount = fsinfo->total_tupcount; segno = fsinfo->segno; } else if (context->aocsSegfileInfo) { AOCSFileSegInfo *fsinfo = context->aocsSegfileInfo[context->i]; tupcount = fsinfo->total_tupcount; segno = fsinfo->segno; } else { Insist(false); } MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); values[0] = Int32GetDatum(segno); values[1] = Int64GetDatum(AppendOnlyVisimap_GetSegmentFileHiddenTupleCount( &context->visiMap, segno)); values[2] = Int64GetDatum(tupcount); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); context->i++; SRF_RETURN_NEXT(funcctx, result); } AppendOnlyVisimap_Finish(&context->visiMap, AccessShareLock); if (context->appendonlySegfileInfo) { FreeAllSegFileInfo(context->appendonlySegfileInfo, context->segfile_info_total); pfree(context->appendonlySegfileInfo); context->appendonlySegfileInfo = NULL; } if (context->aocsSegfileInfo) { FreeAllAOCSSegFileInfo(context->aocsSegfileInfo, context->segfile_info_total); pfree(context->aocsSegfileInfo); context->aocsSegfileInfo = NULL; } heap_close(context->parentRelation, AccessShareLock); pfree(context); funcctx->user_fctx = NULL; SRF_RETURN_DONE(funcctx); }
Datum ginbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation index = info->index; BlockNumber blkno = GIN_ROOT_BLKNO; GinVacuumState gvs; Buffer buffer; BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext, "Gin vacuum temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); gvs.index = index; gvs.callback = callback; gvs.callback_state = callback_state; gvs.strategy = info->strategy; initGinState(&gvs.ginstate, index); /* first time through? */ if (stats == NULL) { /* Yes, so initialize stats to zeroes */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* and cleanup any pending inserts */ ginInsertCleanup(&gvs.ginstate, true, stats); } /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; gvs.result = stats; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); /* find leaf page */ for (;;) { Page page = BufferGetPage(buffer); IndexTuple itup; LockBuffer(buffer, GIN_SHARE); Assert(!GinPageIsData(page)); if (GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); LockBuffer(buffer, GIN_EXCLUSIVE); if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); continue; /* check it one more */ } break; } Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); blkno = GinGetDownlink(itup); Assert(blkno != InvalidBlockNumber); UnlockReleaseBuffer(buffer); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); } /* right now we found leftmost page in entry's BTree */ for (;;) { Page page = BufferGetPage(buffer); Page resPage; uint32 i; Assert(!GinPageIsData(page)); resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot); blkno = GinPageGetOpaque(page)->rightlink; if (resPage) { START_CRIT_SECTION(); PageRestoreTempPage(resPage, page); MarkBufferDirty(buffer); xlogVacuumPage(gvs.index, buffer); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); } else { UnlockReleaseBuffer(buffer); } vacuum_delay_point(); for (i = 0; i < nRoot; i++) { ginVacuumPostingTree(&gvs, rootOfPostingTree[i]); vacuum_delay_point(); } if (blkno == InvalidBlockNumber) /* rightmost page */ break; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_EXCLUSIVE); } MemoryContextDelete(gvs.tmpCxt); PG_RETURN_POINTER(gvs.result); }
/* * scans posting tree and deletes empty pages */ static bool ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDeleteStack *parent, OffsetNumber myoff) { DataPageDeleteStack *me; Buffer buffer; Page page; bool meDelete = FALSE; bool isempty; if (isRoot) { me = parent; } else { if (!parent->child) { me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack)); me->parent = parent; parent->child = me; me->leftBlkno = InvalidBlockNumber; } else me = parent->child; } buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); page = BufferGetPage(buffer); Assert(GinPageIsData(page)); if (!GinPageIsLeaf(page)) { OffsetNumber i; me->blkno = blkno; for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++) { PostingItem *pitem = GinDataPageGetPostingItem(page, i); if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), FALSE, me, i)) i--; } } if (GinPageIsLeaf(page)) isempty = GinDataLeafPageIsEmpty(page); else isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber; if (isempty) { /* we never delete the left- or rightmost branch */ if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page)) { Assert(!isRoot); ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); meDelete = TRUE; } } ReleaseBuffer(buffer); if (!meDelete) me->leftBlkno = blkno; return meDelete; }
/* * Actually do a base backup for the specified tablespaces. * * This is split out mainly to avoid complaints about "variable might be * clobbered by longjmp" from stupider versions of gcc. */ static void perform_base_backup(basebackup_options *opt, DIR *tblspcdir) { XLogRecPtr startptr; TimeLineID starttli; XLogRecPtr endptr; TimeLineID endtli; char *labelfile; int datadirpathlen; datadirpathlen = strlen(DataDir); backup_started_in_recovery = RecoveryInProgress(); startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli, &labelfile); SendXlogRecPtrResult(startptr, starttli); /* * Calculate the relative path of temporary statistics directory * in order to skip the files which are located in that directory later. */ if (is_absolute_path(pgstat_stat_directory) && strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0) statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1); else if (strncmp(pgstat_stat_directory, "./", 2) != 0) statrelpath = psprintf("./%s", pgstat_stat_directory); else statrelpath = pgstat_stat_directory; PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0); { List *tablespaces = NIL; ListCell *lc; struct dirent *de; tablespaceinfo *ti; /* Collect information about all tablespaces */ while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) { char fullpath[MAXPGPATH]; char linkpath[MAXPGPATH]; char *relpath = NULL; int rllen; /* Skip special stuff */ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue; snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name); #if defined(HAVE_READLINK) || defined(WIN32) rllen = readlink(fullpath, linkpath, sizeof(linkpath)); if (rllen < 0) { ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %m", fullpath))); continue; } else if (rllen >= sizeof(linkpath)) { ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullpath))); continue; } linkpath[rllen] = '\0'; /* * Relpath holds the relative path of the tablespace directory * when it's located within PGDATA, or NULL if it's located * elsewhere. */ if (rllen > datadirpathlen && strncmp(linkpath, DataDir, datadirpathlen) == 0 && IS_DIR_SEP(linkpath[datadirpathlen])) relpath = linkpath + datadirpathlen + 1; ti = palloc(sizeof(tablespaceinfo)); ti->oid = pstrdup(de->d_name); ti->path = pstrdup(linkpath); ti->rpath = relpath ? pstrdup(relpath) : NULL; ti->size = opt->progress ? sendTablespace(fullpath, true) : -1; tablespaces = lappend(tablespaces, ti); #else /* * If the platform does not have symbolic links, it should not be * possible to have tablespaces - clearly somebody else created * them. Warn about it and ignore. */ ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif } /* Add a node for the base directory at the end */ ti = palloc0(sizeof(tablespaceinfo)); ti->size = opt->progress ? sendDir(".", 1, true, tablespaces) : -1; tablespaces = lappend(tablespaces, ti); /* Send tablespace header */ SendBackupHeader(tablespaces); /* Send off our tablespaces one by one */ foreach(lc, tablespaces) { tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); StringInfoData buf; /* Send CopyOutResponse message */ pq_beginmessage(&buf, 'H'); pq_sendbyte(&buf, 0); /* overall format */ pq_sendint(&buf, 0, 2); /* natts */ pq_endmessage(&buf); if (ti->path == NULL) { struct stat statbuf; /* In the main tar, include the backup_label first... */ sendFileWithContent(BACKUP_LABEL_FILE, labelfile); /* ... then the bulk of the files ... 
*/ sendDir(".", 1, false, tablespaces); /* ... and pg_control after everything else. */ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat control file \"%s\": %m", XLOG_CONTROL_FILE))); sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false); } else sendTablespace(ti->path, false); /* * If we're including WAL, and this is the main data directory we * don't terminate the tar stream here. Instead, we will append * the xlog files below and terminate it then. This is safe since * the main data directory is always sent *last*. */ if (opt->includewal && ti->path == NULL) { Assert(lnext(lc) == NULL); } else pq_putemptymessage('c'); /* CopyDone */ } }
/* ---------------------------------------------------------------- * ExecHashTableCreate * * create an empty hashtable data structure for hashjoin. * ---------------------------------------------------------------- */ HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) { HashJoinTable hashtable; Plan *outerNode; int nbuckets; int nbatch; int num_skew_mcvs; int log2_nbuckets; int nkeys; int i; ListCell *ho; MemoryContext oldcxt; /* * Get information about the size of the relation to be hashed (it's the * "outer" subtree of this node, but the inner relation of the hashjoin). * Compute the appropriate size of the hash table. */ outerNode = outerPlan(node); ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width, OidIsValid(node->skewTable), &nbuckets, &nbatch, &num_skew_mcvs); #ifdef HJDEBUG printf("nbatch = %d, nbuckets = %d\n", nbatch, nbuckets); #endif /* nbuckets must be a power of 2 */ log2_nbuckets = my_log2(nbuckets); Assert(nbuckets == (1 << log2_nbuckets)); /* * Initialize the hash table control block. * * The hashtable control block is just palloc'd from the executor's * per-query memory context. */ hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); hashtable->nbuckets = nbuckets; hashtable->log2_nbuckets = log2_nbuckets; hashtable->buckets = NULL; hashtable->keepNulls = keepNulls; hashtable->skewEnabled = false; hashtable->skewBucket = NULL; hashtable->skewBucketLen = 0; hashtable->nSkewBuckets = 0; hashtable->skewBucketNums = NULL; hashtable->nbatch = nbatch; hashtable->curbatch = 0; hashtable->nbatch_original = nbatch; hashtable->nbatch_outstart = nbatch; hashtable->growEnabled = true; hashtable->totalTuples = 0; hashtable->innerBatchFile = NULL; hashtable->outerBatchFile = NULL; hashtable->spaceUsed = 0; hashtable->spacePeak = 0; hashtable->spaceAllowed = work_mem * 1024L; hashtable->spaceUsedSkew = 0; hashtable->spaceAllowedSkew = hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100; // cs3223, allocate memory for bitvector //printf(" Allocating memory for bitvector \n"); hashtable->bitvector = (int*) palloc0(bitvector_size*1024); // sizeof(int)/32 = 1/8 hashtable->numBVfilter = 0; hashtable->numProbNotJoin = 0; hashtable->firstCheck = 0; //printf("zero element: %d\n",hashtable->bitvector[0]); /* * Get info about the hash functions to be used for each hash key. Also * remember whether the join operators are strict. */ nkeys = list_length(hashOperators); hashtable->outer_hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); hashtable->inner_hashfunctions = (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo)); hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool)); i = 0; foreach(ho, hashOperators) { Oid hashop = lfirst_oid(ho); Oid left_hashfn; Oid right_hashfn; if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn)) elog(ERROR, "could not find hash function for hash operator %u", hashop); fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]); fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]); hashtable->hashStrict[i] = op_strict(hashop); i++; }
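/*
 * The bitvector allocated in ExecHashTableCreate above (the "cs3223"
 * additions) is a simple bit-filter over the inner relation's hash values.
 * The helpers below are a hypothetical sketch of how such a filter could be
 * maintained during the build phase and consulted during the probe phase;
 * the names bitvector_set_bit / bitvector_test_bit are not part of the
 * original code, and bitvector_size is assumed to be the same variable used
 * for the palloc0() above (so the filter holds bitvector_size * 1024 * 8 bits).
 */
static inline void
bitvector_set_bit(HashJoinTable hashtable, uint32 hashvalue)
{
	uint32		nbits = (uint32) bitvector_size * 1024 * 8;
	uint32		bit = hashvalue % nbits;

	/* build phase: remember that some inner tuple hashed to this bit */
	hashtable->bitvector[bit / 32] |= (int) (1U << (bit % 32));
}

static inline bool
bitvector_test_bit(HashJoinTable hashtable, uint32 hashvalue)
{
	uint32		nbits = (uint32) bitvector_size * 1024 * 8;
	uint32		bit = hashvalue % nbits;

	/*
	 * Probe phase: if the bit is clear, no inner tuple can possibly join
	 * with this outer tuple, so the bucket probe can be skipped (and counted
	 * via numBVfilter / numProbNotJoin as appropriate).
	 */
	return (hashtable->bitvector[bit / 32] & (int) (1U << (bit % 32))) != 0;
}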
/* * setup_regexp_matches --- do the initial matching for regexp_matches() * or regexp_split() * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * * The four bool parameters have only two patterns (one for matching, one for * splitting) but it seems clearer to distinguish the functionality this way * than to key it all off one "is_split" flag. We don't currently assume that * fetching_unmatched is exclusive of fetching the matched text too; if it's * set, the conversion buffer is large enough to fetch any single matched or * unmatched string, but not any larger substring. (In practice, when splitting * the matches are usually small anyway, and it didn't seem worth complicating * the code further.) */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, text *flags, Oid collation, bool force_glob, bool use_subpatterns, bool ignore_degenerate, bool fetching_unmatched) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int prev_valid_match_end; int start_search; int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* determine options */ parse_re_flags(&re_flags, flags); if (force_glob) { /* user mustn't specify 'g' for regexp_split */ if (re_flags.glob) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("regexp_split does not support the global option"))); /* but we find all the matches anyway */ re_flags.glob = true; } /* set up the compiled pattern */ cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* * the real output space (grown dynamically if needed) * * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather * than at 2^27 */ array_len = re_flags.glob ? 255 : 31; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; prev_valid_match_end = 0; start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { /* * If requested, ignore degenerate matches, which are zero-length * matches occurring at the start or end of a string or just after a * previous match. 
*/ if (!ignore_degenerate || (pmatch[0].rm_so < wide_len && pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ if (array_len > MaxAllocSize/sizeof(int)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ if (use_subpatterns) { int i; for (i = 1; i <= matchctx->npatterns; i++) { int so = pmatch[i].rm_so; int eo = pmatch[i].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } } else { int so = pmatch[0].rm_so; int eo = pmatch[0].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } matchctx->nmatches++; /* * check length of unmatched portion between end of previous valid * (nondegenerate, or degenerate but not ignored) match and start * of current one */ if (fetching_unmatched && pmatch[0].rm_so >= 0 && (pmatch[0].rm_so - prev_valid_match_end) > maxlen) maxlen = (pmatch[0].rm_so - prev_valid_match_end); prev_valid_match_end = pmatch[0].rm_eo; } prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags.glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* * check length of unmatched portion between end of last match and end of * input string */ if (fetching_unmatched && (wide_len - prev_valid_match_end) > maxlen) maxlen = (wide_len - prev_valid_match_end); /* * Keep a note of the end position of the string for the benefit of * splitting code. */ matchctx->match_locs[array_idx] = wide_len; if (eml > 1) { int64 maxsiz = eml * (int64) maxlen; int conv_bufsiz; /* * Make the conversion buffer large enough for any substring of * interest. * * Worst case: assume we need the maximum size (maxlen*eml), but take * advantage of the fact that the original string length in bytes is an * upper bound on the byte length of any fetched substring (and we know * that len+1 is safe to allocate because the varlena header is longer * than 1 byte). */ if (maxsiz > orig_len) conv_bufsiz = orig_len + 1; else conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ matchctx->conv_buf = palloc(conv_bufsiz); matchctx->conv_bufsiz = conv_bufsiz; matchctx->wide_str = wide_str; } else { /* No need to keep the wide string if we're in a single-byte charset. */ pfree(wide_str); matchctx->wide_str = NULL; matchctx->conv_buf = NULL; matchctx->conv_bufsiz = 0; } /* Clean up temp storage */ pfree(pmatch); return matchctx; }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. * * The return value indicates whether this function has held off * interrupts -- caller must RESUME_INTERRUPTS() after commit if true. */ bool lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy, List *updated_stats) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; bool heldoff = false; pg_rusage_init(&ru0); /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0) starttime = GetCurrentTimestamp(); if (vacstmt->verbose) elevel = INFO; else elevel = DEBUG2; if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ #ifdef FAULT_INJECTOR if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeSegmentFileDropPhase, DDLNotSpecified, "", // databaseName ""); // tableName } if (vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeCleanupPhase, DDLNotSpecified, "", // databaseName ""); // tableName } #endif /* * MPP-23647. Update xid limits for heap as well as appendonly * relations. This allows setting relfrozenxid to correct value * for an appendonly (AO/CO) table. */ vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); /* * Execute the various vacuum operations. Appendonly tables are treated * differently. */ if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel)) { lazy_vacuum_aorel(onerel, vacstmt, updated_stats); return false; } vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); /* heap relation */ /* Set threshold for interesting free space = average request size */ /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats, vacstmt->extra_oids); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. * * Note that after we've truncated the heap, it's too late to abort the * transaction; doing so would lose the sinval messages needed to tell * the other backends about the table being shrunk. We prevent interrupts * in that case; caller is responsible for re-enabling them after * committing the transaction. 
*/ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) { HOLD_INTERRUPTS(); heldoff = true; lazy_truncate_heap(onerel, vacrelstats); } /* Update shared free space map with final free space info */ lazy_update_fsm(onerel, vacrelstats); if (vacrelstats->tot_free_pages > MaxFSMPages) ereport(WARNING, (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel)), /* Only suggest VACUUM FULL if > 20% free */ (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ? errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") : errhint("Consider increasing the configuration parameter \"max_fsm_pages\"."))); /* Update statistics in pg_class */ vac_update_relstats_from_list(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); if (gp_indexcheck_vacuum == INDEX_CHECK_ALL || (gp_indexcheck_vacuum == INDEX_CHECK_SYSTEM && PG_CATALOG_NAMESPACE == RelationGetNamespace(onerel))) { int i; for (i = 0; i < nindexes; i++) { if (Irel[i]->rd_rel->relam == BTREE_AM_OID) _bt_validate_vacuum(Irel[i], onerel, OldestXmin); } } /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->rel_tuples, pg_rusage_show(&ru0)))); } return heldoff; }
/*
 * Initializes the dictionary for use in backends - checks whether such dictionary
 * and list of stopwords is already used, and if not then parses it and loads it into
 * the shared segment.
 *
 * This is called through dispell_init() which is responsible for proper locking
 * of the shared memory (using SegmentInfo->lock).
 */
static void
init_shared_dict(DictInfo *info, char *dictFile, char *affFile, char *stopFile)
{
	int			size;

	SharedIspellDict *shdict = NULL;
	SharedStopList *shstop = NULL;

	IspellDict *dict;
	StopList	stoplist;

	/* DICTIONARY + AFFIXES */

	/* TODO This should probably check that the filenames are not NULL, and maybe that
	 * the files exist. Or maybe that's handled by the NIImport* functions. */

	/* lookup if the dictionary (words and affixes) is already loaded in the shared segment */
	shdict = get_shared_dict(dictFile, affFile);

	/* load the dictionary / affixes if not yet defined */
	if (shdict == NULL)
	{
		dict = (IspellDict *) palloc0(sizeof(IspellDict));

		NIStartBuild(dict);

		NIImportDictionary(dict, get_tsearch_config_filename(dictFile, "dict"));
		NIImportAffixes(dict, get_tsearch_config_filename(affFile, "affix"));

		NISortDictionary(dict);
		NISortAffixes(dict);

		NIFinishBuild(dict);

		/* check available space in shared segment */
		size = sizeIspellDict(dict, dictFile, affFile);
		if (size > segment_info->available)
			elog(ERROR, "shared dictionary %s.dict / %s.affix needs %d B, only %ld B available",
				 dictFile, affFile, size, segment_info->available);

		/* fine, there's enough space - copy the dictionary */
		shdict = copyIspellDict(dict, dictFile, affFile, size, dict->nspell);

		elog(INFO, "shared dictionary %s.dict / %s.affix loaded, used %d B, %ld B remaining",
			 dictFile, affFile, size, segment_info->available);

		/* add the new dictionary to the linked list (of SharedIspellDict structures) */
		shdict->next = segment_info->dict;
		segment_info->dict = shdict;
	}

	/* STOP WORDS */

	/* lookup if the stop words are already loaded in the shared segment, but only if there
	 * actually is a list */
	if (stopFile != NULL)
	{
		shstop = get_shared_stop_list(stopFile);

		/* load the stopwords if not yet defined */
		if (shstop == NULL)
		{
			readstoplist(stopFile, &stoplist, lowerstr);

			size = sizeStopList(&stoplist, stopFile);
			if (size > segment_info->available)
				elog(ERROR, "shared stoplist %s.stop needs %d B, only %ld B available",
					 stopFile, size, segment_info->available);

			/* fine, there's enough space - copy the stoplist */
			shstop = copyStopList(&stoplist, stopFile, size);

			elog(INFO, "shared stoplist %s.stop loaded, used %d B, %ld B remaining",
				 stopFile, size, segment_info->available);

			/* add the new stopword list to the linked list (of SharedStopList structures) */
			shstop->next = segment_info->stop;
			segment_info->stop = shstop;
		}
	}

	/* Now, fill the DictInfo structure for the backend (references to dictionary,
	 * stopwords and the filenames). */
	info->dict = shdict;
	info->stop = shstop;
	info->lookup = GetCurrentTimestamp();

	memcpy(info->dictFile, dictFile, strlen(dictFile) + 1);
	memcpy(info->affixFile, affFile, strlen(affFile) + 1);
	if (stopFile != NULL)
		memcpy(info->stopFile, stopFile, strlen(stopFile) + 1);
	else
		info->stopFile[0] = '\0';
}
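/*
 * The header comment above notes that locking is the caller's
 * responsibility.  Below is a minimal sketch of how a caller such as
 * dispell_init() could wrap init_shared_dict() with the segment lock; it
 * assumes segment_info->lock is an LWLock, as the comment implies, and the
 * wrapper name load_dict_locked is hypothetical.  The real extension code
 * also parses the dictionary options to obtain the file names first.
 */
#include "storage/lwlock.h"

static void
load_dict_locked(DictInfo *info, char *dictFile, char *affFile, char *stopFile)
{
	LWLockAcquire(segment_info->lock, LW_EXCLUSIVE);

	PG_TRY();
	{
		init_shared_dict(info, dictFile, affFile, stopFile);
	}
	PG_CATCH();
	{
		/* make sure the segment lock is not left held on error */
		LWLockRelease(segment_info->lock);
		PG_RE_THROW();
	}
	PG_END_TRY();

	LWLockRelease(segment_info->lock);
}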
/* * lazy_vacuum_aorel -- perform LAZY VACUUM for one Append-only relation. */ static void lazy_vacuum_aorel(Relation onerel, VacuumStmt *vacstmt, List *updated_stats) { LVRelStats *vacrelstats; bool update_relstats = true; vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); if (vacuumStatement_IsInAppendOnlyPreparePhase(vacstmt)) { elogif(Debug_appendonly_print_compaction, LOG, "Vacuum prepare phase %s", RelationGetRelationName(onerel)); vacuum_appendonly_indexes(onerel, vacstmt, updated_stats); if (RelationIsAoRows(onerel)) AppendOnlyTruncateToEOF(onerel); else AOCSTruncateToEOF(onerel); /* * MPP-23647. For empty tables, we skip compaction phase * and cleanup phase. Therefore, we update the stats * (specifically, relfrozenxid) in prepare phase if the * table is empty. Otherwise, the stats will be updated in * the cleanup phase, when we would have computed the * correct values for stats. */ if (vacstmt->appendonly_relation_empty) { update_relstats = true; /* * For an empty relation, the only stats we care about * is relfrozenxid and relhasindex. We need to be * mindful of correctly setting relhasindex here. * relfrozenxid is already taken care of above by * calling vacuum_set_xid_limits(). */ vacrelstats->hasindex = onerel->rd_rel->relhasindex; } else { /* * For a non-empty relation, follow the usual * compaction phases and do not update stats in * prepare phase. */ update_relstats = false; } } else if (!vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt)) { vacuum_appendonly_rel(onerel, vacstmt); update_relstats = false; } else { elogif(Debug_appendonly_print_compaction, LOG, "Vacuum cleanup phase %s", RelationGetRelationName(onerel)); vacuum_appendonly_fill_stats(onerel, ActiveSnapshot, &vacrelstats->rel_pages, &vacrelstats->rel_tuples, &vacrelstats->hasindex); /* reset the remaining LVRelStats values */ vacrelstats->nonempty_pages = 0; vacrelstats->num_dead_tuples = 0; vacrelstats->max_dead_tuples = 0; vacrelstats->tuples_deleted = 0; vacrelstats->tot_free_pages = 0; vacrelstats->fs_is_heap = false; vacrelstats->num_free_pages = 0; vacrelstats->max_free_pages = 0; vacrelstats->pages_removed = 0; } if (update_relstats) { /* Update statistics in pg_class */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); } }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, List *updated_stats, List *all_extra_oids) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; int reindex_count = 1; PGRUsage ru0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; lazy_space_alloc(vacrelstats, nblocks); for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) { List *extra_oids = get_oids_for_bitmap(all_extra_oids, Irel[i], onerel, reindex_count); lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats, extra_oids); list_free(extra_oids); } reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; } /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBuffer(onerel, blkno); /* Initially, we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* LockBufferForCleanup(buf)? */ if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); /* must record in xlog so that changetracking will know about this change */ log_heap_newpage(onerel, page, blkno); empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } if (PageIsEmpty(page)) { empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf, false)) { case HEAPTUPLE_DEAD: tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); tups_vacuumed += 1; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it * needs freezing. If we already froze anything, then * we've already switched the buffer lock to exclusive. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, (nfrozen > 0) ? InvalidBuffer : buf)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); /* no XLOG for temp tables, though */ if (!onerel->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. 
*/ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Trade in buffer share lock for super-exclusive lock */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, PageGetFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ } /* save stats for use later */ vacrelstats->rel_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) { List *extra_oids = get_oids_for_bitmap(all_extra_oids, Irel[i], onerel, reindex_count); lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats, extra_oids); list_free(extra_oids); } reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); if (vacrelstats->tot_free_pages > MaxFSMPages) ereport(WARNING, (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", get_namespace_name(RelationGetNamespace(onerel)), relname), errhint("Consider compacting this relation or increasing the configuration parameter \"max_fsm_pages\"."))); }
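/*
 * Illustrative sketch (not part of the source above): a standalone model of the
 * dead-TID batching performed by lazy_scan_heap().  Dead tuple ids accumulate in
 * a bounded array; when the array could not absorb another worst-case page, the
 * indexes and heap are "vacuumed" and the array is reset.  The limits and the
 * flush callback are hypothetical stand-ins.
 */
#include <stdio.h>

#define MAX_DEAD		16		/* assumed capacity of the dead-TID array */
#define PER_PAGE_MAX	5		/* assumed worst-case dead tuples per page */

static int	dead[MAX_DEAD];
static int	ndead = 0;

static void
flush_dead_tuples(void)
{
	/* stands in for lazy_vacuum_index() + lazy_vacuum_heap() */
	printf("flushing %d dead tuple ids\n", ndead);
	ndead = 0;
}

int
main(void)
{
	int			page;

	for (page = 0; page < 10; page++)
	{
		int			i;

		/* flush first if this page might overflow the array */
		if (MAX_DEAD - ndead < PER_PAGE_MAX && ndead > 0)
			flush_dead_tuples();

		/* pretend every page contributes three dead tuples */
		for (i = 0; i < 3; i++)
			dead[ndead++] = page * 100 + i;
	}

	if (ndead > 0)				/* final cycle, as at the end of the scan */
		flush_dead_tuples();
	return 0;
}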
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages; GinState ginstate; GinStatsData idxStat; /* * In an autovacuum analyze, we want to clean up pending insertions. * Otherwise, an ANALYZE-only call is a no-op. */ if (info->analyze_only) { if (IsAutoVacuumWorkerProcess()) { initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } PG_RETURN_POINTER(stats); } /* * Set up all-zero stats and cleanup pending inserts if ginbulkdelete * wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } memset(&idxStat, 0, sizeof(idxStat)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); totFreePages = 0; for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); totFreePages++; } else if (GinPageIsData(page)) { idxStat.nDataPages++; } else if (!GinPageIsList(page)) { idxStat.nEntryPages++; if (GinPageIsLeaf(page)) idxStat.nEntries += PageGetMaxOffsetNumber(page); } UnlockReleaseBuffer(buffer); } /* Update the metapage with accurate page and entry counts */ idxStat.nTotalPages = npages; ginUpdateStats(info->index, &idxStat); /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
static Datum gp_aovisimap_internal(PG_FUNCTION_ARGS, Oid aoRelOid) { Datum values[3]; bool nulls[3]; HeapTuple tuple; Datum result; typedef struct Context { Relation aorel; AppendOnlyVisimapScan visiMapScan; AOTupleId aoTupleId; } Context; FuncCallContext *funcctx; Context *context; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function * calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "segno", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "row_num", INT8OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ context = (Context *) palloc0(sizeof(Context)); context->aorel = heap_open(aoRelOid, AccessShareLock); if (!(RelationIsAoRows(context->aorel) || RelationIsAoCols(context->aorel))) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Function not supported on relation"))); } AppendOnlyVisimapScan_Init(&context->visiMapScan, context->aorel->rd_appendonly->visimaprelid, context->aorel->rd_appendonly->visimapidxid, AccessShareLock, SnapshotNow); AOTupleIdInit_Init(&context->aoTupleId); funcctx->user_fctx = (void *) context; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); context = (Context *) funcctx->user_fctx; while (true) { if (!AppendOnlyVisimapScan_GetNextInvisible( &context->visiMapScan, &context->aoTupleId)) { break; } MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); values[0] = ItemPointerGetDatum((ItemPointer)&context->aoTupleId); values[1] = Int32GetDatum(AOTupleIdGet_segmentFileNum(&context->aoTupleId)); values[2] = Int64GetDatum(AOTupleIdGet_rowNum(&context->aoTupleId)); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } AppendOnlyVisimapScan_Finish(&context->visiMapScan, AccessShareLock); heap_close(context->aorel, AccessShareLock); pfree(context); funcctx->user_fctx = NULL; SRF_RETURN_DONE(funcctx); }
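/*
 * Illustrative sketch (not part of the source above): a minimal set-returning
 * function using the same funcapi.h value-per-call protocol that
 * gp_aovisimap_internal() follows (SRF_FIRSTCALL_INIT, SRF_PERCALL_SETUP,
 * SRF_RETURN_NEXT, SRF_RETURN_DONE).  The function name count_to_n and its
 * argument are made up for the example; it assumes a non-negative argument.
 */
#include "postgres.h"
#include "funcapi.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(count_to_n);

Datum
count_to_n(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();

		/*
		 * Nothing is palloc'd here, but any cross-call state would have to
		 * live in the multi-call memory context, as in the function above.
		 */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		funcctx->max_calls = PG_GETARG_INT32(0);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();

	if (funcctx->call_cntr < funcctx->max_calls)
		SRF_RETURN_NEXT(funcctx, Int32GetDatum((int32) funcctx->call_cntr + 1));
	else
		SRF_RETURN_DONE(funcctx);
}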
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; bool needs_longlock; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); needs_longlock = rel_needs_long_lock(relationObjectId); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * CDB: Get partitioning key info for distributed relation. */ rel->cdbpolicy = RelationGetPartitioningKey(relation); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) { cdb_estimate_rel_size( rel, relation, relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->cdb_default_stats_used ); } /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; /* Warn if indexed table needs ANALYZE. */ if (rel->cdb_default_stats_used) cdb_default_stats_warning_for_table(relation->rd_id); indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. 
Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes, if they're marked * IndexIsReady. */ if (!IndexIsValid(index)) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Allocate per-column info arrays. To save a few palloc cycles * we allocate all the Oid-type arrays in one request. Note that * the opfamily array needs an extra, terminating zero at the end. * We pre-zero the ordering info in case the index is unordered. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->opfamily = (Oid *) palloc0(sizeof(Oid) * (4 * ncolumns + 1)); info->opcintype = info->opfamily + (ncolumns + 1); info->fwdsortop = info->opcintype + ncolumns; info->revsortop = info->fwdsortop + ncolumns; info->nulls_first = (bool *) palloc0(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; /* * Fetch the ordering operators associated with the index, if any. * We expect that all ordering-capable indexes use btree's * strategy numbers for the ordering operators. */ if (indexRelation->rd_am->amcanorder) { int nstrat = indexRelation->rd_am->amstrategies; for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; int fwdstrat; int revstrat; if (opt & INDOPTION_DESC) { fwdstrat = BTGreaterStrategyNumber; revstrat = BTLessStrategyNumber; } else { fwdstrat = BTLessStrategyNumber; revstrat = BTGreaterStrategyNumber; } /* * Index AM must have a fixed set of strategies for it to * make sense to specify amcanorder, so we need not allow * the case amstrategies == 0. */ if (fwdstrat > 0) { Assert(fwdstrat <= nstrat); info->fwdsortop[i] = indexRelation->rd_operator[i * nstrat + fwdstrat - 1]; } if (revstrat > 0) { Assert(revstrat <= nstrat); info->revsortop[i] = indexRelation->rd_operator[i * nstrat + revstrat - 1]; } info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. 
If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ cdb_estimate_rel_size(rel, relation, indexRelation, NULL, &info->pages, &info->tuples, &info->cdb_default_stats_used); if (!info->indpred || info->tuples > rel->tuples) info->tuples = rel->tuples; if (info->cdb_default_stats_used && !rel->cdb_default_stats_used) cdb_default_stats_warning_for_index(relation->rd_id, indexoid); index_close(indexRelation, needs_longlock ? NoLock : lmode); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
Datum crosstab(PG_FUNCTION_ARGS) { char *sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; Tuplestorestate *tupstore; TupleDesc tupdesc; int call_cntr; int max_calls; AttInMetadata *attinmeta; SPITupleTable *spi_tuptable; TupleDesc spi_tupdesc; bool firstpass; char *lastrowid; int i; int num_categories; MemoryContext per_query_ctx; MemoryContext oldcontext; int ret; int proc; /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not " \ "allowed in this context"))); per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; /* Connect to SPI manager */ if ((ret = SPI_connect()) < 0) /* internal error */ elog(ERROR, "crosstab: SPI_connect returned %d", ret); /* Retrieve the desired rows */ ret = SPI_execute(sql, true, 0); proc = SPI_processed; /* If no qualifying tuples, fall out early */ if (ret != SPI_OK_SELECT || proc <= 0) { SPI_finish(); rsinfo->isDone = ExprEndResult; PG_RETURN_NULL(); } spi_tuptable = SPI_tuptable; spi_tupdesc = spi_tuptable->tupdesc; /*---------- * The provided SQL query must always return three columns. * * 1. rowname * the label or identifier for each row in the final result * 2. category * the label or identifier for each column in the final result * 3. values * the value for each column in the final result *---------- */ if (spi_tupdesc->natts != 3) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid source data SQL statement"), errdetail("The provided SQL must return 3 " "columns: rowid, category, and values."))); /* get a tuple descriptor for our result type */ switch (get_call_result_type(fcinfo, NULL, &tupdesc)) { case TYPEFUNC_COMPOSITE: /* success */ break; case TYPEFUNC_RECORD: /* failed to determine actual type of RECORD */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("function returning record called in context " "that cannot accept type record"))); break; default: /* result type isn't composite */ elog(ERROR, "return type must be a row type"); break; } /* * Check that return tupdesc is compatible with the data we got from SPI, * at least based on number and type of attributes */ if (!compatCrosstabTupleDescs(tupdesc, spi_tupdesc)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("return and sql tuple descriptions are " \ "incompatible"))); /* * switch to long-lived memory context */ oldcontext = MemoryContextSwitchTo(per_query_ctx); /* make sure we have a persistent copy of the result tupdesc */ tupdesc = CreateTupleDescCopy(tupdesc); /* initialize our tuplestore in long-lived context */ tupstore = tuplestore_begin_heap(rsinfo->allowedModes & SFRM_Materialize_Random, false, work_mem); MemoryContextSwitchTo(oldcontext); /* * Generate attribute metadata needed later to produce tuples from raw C * strings */ attinmeta = TupleDescGetAttInMetadata(tupdesc); /* total number of tuples to be examined */ max_calls = proc; /* the return tuple always must have 1 rowid + num_categories columns */ num_categories = tupdesc->natts - 1; firstpass = true; lastrowid = NULL; for (call_cntr = 0; call_cntr < max_calls; call_cntr++) { bool skip_tuple = false; char **values; /* allocate and zero space */ values = (char **) palloc0((1 + 
num_categories) * sizeof(char *)); /* * now loop through the sql results and assign each value in sequence * to the next category */ for (i = 0; i < num_categories; i++) { HeapTuple spi_tuple; char *rowid; /* see if we've gone too far already */ if (call_cntr >= max_calls) break; /* get the next sql result tuple */ spi_tuple = spi_tuptable->vals[call_cntr]; /* get the rowid from the current sql result tuple */ rowid = SPI_getvalue(spi_tuple, spi_tupdesc, 1); /* * If this is the first pass through the values for this rowid, * set the first column to rowid */ if (i == 0) { xpstrdup(values[0], rowid); /* * Check to see if the rowid is the same as that of the last * tuple sent -- if so, skip this tuple entirely */ if (!firstpass && xstreq(lastrowid, rowid)) { xpfree(rowid); skip_tuple = true; break; } } /* * If rowid hasn't changed on us, continue building the output * tuple. */ if (xstreq(rowid, values[0])) { /* * Get the next category item value, which is always attribute * number three. * * Be careful to assign the value to the array index based on * which category we are presently processing. */ values[1 + i] = SPI_getvalue(spi_tuple, spi_tupdesc, 3); /* * increment the counter since we consume a row for each * category, but not for last pass because the outer loop will * do that for us */ if (i < (num_categories - 1)) call_cntr++; xpfree(rowid); } else { /* * We'll fill in NULLs for the missing values, but we need to * decrement the counter since this sql result row doesn't * belong to the current output tuple. */ call_cntr--; xpfree(rowid); break; } } if (!skip_tuple) { HeapTuple tuple; /* build the tuple and store it */ tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, tuple); heap_freetuple(tuple); } /* Remember current rowid */ xpfree(lastrowid); xpstrdup(lastrowid, values[0]); firstpass = false; /* Clean up */ for (i = 0; i < num_categories + 1; i++) if (values[i] != NULL) pfree(values[i]); pfree(values); } /* let the caller know we're sending back a tuplestore */ rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; /* release SPI related resources (and return to caller's context) */ SPI_finish(); return (Datum) 0; }
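/*
 * Illustrative sketch (not part of contrib/tablefunc): a standalone model of
 * the pivot performed by crosstab().  Source rows arrive ordered by rowid; for
 * each rowid one output row is emitted with up to NCAT category values, padded
 * with NULL ("-") when a rowid supplies fewer rows, and a new output row starts
 * as soon as the rowid changes.  The data and names are made up.
 */
#include <stdio.h>
#include <string.h>

#define NCAT 3					/* assumed number of output categories */

struct srcrow
{
	const char *rowid;
	const char *value;
};

int
main(void)
{
	static const struct srcrow src[] = {
		{"a", "a1"}, {"a", "a2"}, {"a", "a3"},
		{"b", "b1"}, {"b", "b2"},	/* "b" is short one value */
		{"c", "c1"}, {"c", "c2"}, {"c", "c3"},
	};
	int			nsrc = (int) (sizeof(src) / sizeof(src[0]));
	int			i = 0;

	while (i < nsrc)
	{
		const char *rowid = src[i].rowid;
		const char *vals[NCAT];
		int			ncollected = 0;

		/* consume consecutive rows that share this rowid, up to NCAT of them */
		while (i < nsrc && ncollected < NCAT &&
			   strcmp(src[i].rowid, rowid) == 0)
			vals[ncollected++] = src[i++].value;

		/* extra rows for the same rowid are silently skipped, as in crosstab */
		while (i < nsrc && strcmp(src[i].rowid, rowid) == 0)
			i++;

		printf("%s:", rowid);
		for (int c = 0; c < NCAT; c++)
			printf(" %s", c < ncollected ? vals[c] : "-");
		printf("\n");
	}
	return 0;
}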
/* * ConstructTupleDescriptor * * Build an index tuple descriptor for a new index */ static TupleDesc ConstructTupleDescriptor(Relation heapRelation, IndexInfo *indexInfo, Oid *classObjectId) { int numatts = indexInfo->ii_NumIndexAttrs; List *indexprs = indexInfo->ii_Expressions; TupleDesc heapTupDesc; TupleDesc indexTupDesc; int natts; /* #atts in heap rel --- for error checks */ int i; heapTupDesc = RelationGetDescr(heapRelation); natts = RelationGetForm(heapRelation)->relnatts; /* * allocate the new tuple descriptor */ indexTupDesc = CreateTemplateTupleDesc(numatts, false); /* * For simple index columns, we copy the pg_attribute row from the * parent relation and modify it as necessary. For expressions we * have to cons up a pg_attribute row the hard way. */ for (i = 0; i < numatts; i++) { AttrNumber atnum = indexInfo->ii_KeyAttrNumbers[i]; Form_pg_attribute to; HeapTuple tuple; Form_pg_type typeTup; Oid keyType; indexTupDesc->attrs[i] = to = (Form_pg_attribute) palloc0(ATTRIBUTE_TUPLE_SIZE); if (atnum != 0) { /* Simple index column */ Form_pg_attribute from; if (atnum < 0) { /* * here we are indexing on a system attribute (-1...-n) */ from = SystemAttributeDefinition(atnum, heapRelation->rd_rel->relhasoids); } else { /* * here we are indexing on a normal attribute (1...n) */ if (atnum > natts) /* safety check */ elog(ERROR, "invalid column number %d", atnum); from = heapTupDesc->attrs[AttrNumberGetAttrOffset(atnum)]; } /* * now that we've determined the "from", let's copy the tuple * desc data... */ memcpy(to, from, ATTRIBUTE_TUPLE_SIZE); /* * Fix the stuff that should not be the same as the underlying * attr */ to->attnum = i + 1; to->attstattarget = 0; to->attcacheoff = -1; to->attnotnull = false; to->atthasdef = false; to->attislocal = true; to->attinhcount = 0; } else { /* Expressional index */ Node *indexkey; if (indexprs == NIL) /* shouldn't happen */ elog(ERROR, "too few entries in indexprs list"); indexkey = (Node *) lfirst(indexprs); indexprs = lnext(indexprs); /* * Make the attribute's name "pg_expresssion_nnn" (maybe think * of something better later) */ sprintf(NameStr(to->attname), "pg_expression_%d", i + 1); /* * Lookup the expression type in pg_type for the type length * etc. */ keyType = exprType(indexkey); tuple = SearchSysCache(TYPEOID, ObjectIdGetDatum(keyType), 0, 0, 0); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for type %u", keyType); typeTup = (Form_pg_type) GETSTRUCT(tuple); /* * Assign some of the attributes values. Leave the rest as 0. */ to->attnum = i + 1; to->atttypid = keyType; to->attlen = typeTup->typlen; to->attbyval = typeTup->typbyval; to->attstorage = typeTup->typstorage; to->attalign = typeTup->typalign; to->attcacheoff = -1; to->atttypmod = -1; to->attislocal = true; ReleaseSysCache(tuple); } /* * We do not yet have the correct relation OID for the index, so * just set it invalid for now. InitializeAttributeOids() will * fix it later. */ to->attrelid = InvalidOid; /* * Check the opclass to see if it provides a keytype (overriding * the attribute type). 
*/ tuple = SearchSysCache(CLAOID, ObjectIdGetDatum(classObjectId[i]), 0, 0, 0); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for opclass %u", classObjectId[i]); keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype; ReleaseSysCache(tuple); if (OidIsValid(keyType) && keyType != to->atttypid) { /* index value and heap value have different types */ tuple = SearchSysCache(TYPEOID, ObjectIdGetDatum(keyType), 0, 0, 0); if (!HeapTupleIsValid(tuple)) elog(ERROR, "cache lookup failed for type %u", keyType); typeTup = (Form_pg_type) GETSTRUCT(tuple); to->atttypid = keyType; to->atttypmod = -1; to->attlen = typeTup->typlen; to->attbyval = typeTup->typbyval; to->attalign = typeTup->typalign; to->attstorage = typeTup->typstorage; ReleaseSysCache(tuple); } } return indexTupDesc; }
/* * Look to see if we have template information for the given language name. */ static PLTemplate * find_language_template(const char *languageName) { PLTemplate *result; Relation rel; SysScanDesc scan; ScanKeyData key; HeapTuple tup; rel = heap_open(PLTemplateRelationId, AccessShareLock); ScanKeyInit(&key, Anum_pg_pltemplate_tmplname, BTEqualStrategyNumber, F_NAMEEQ, CStringGetDatum(languageName)); scan = systable_beginscan(rel, PLTemplateNameIndexId, true, SnapshotNow, 1, &key); tup = systable_getnext(scan); if (HeapTupleIsValid(tup)) { Form_pg_pltemplate tmpl = (Form_pg_pltemplate) GETSTRUCT(tup); Datum datum; bool isnull; result = (PLTemplate *) palloc0(sizeof(PLTemplate)); result->tmpltrusted = tmpl->tmpltrusted; result->tmpldbacreate = tmpl->tmpldbacreate; /* Remaining fields are variable-width so we need heap_getattr */ datum = heap_getattr(tup, Anum_pg_pltemplate_tmplhandler, RelationGetDescr(rel), &isnull); if (!isnull) result->tmplhandler = TextDatumGetCString(datum); datum = heap_getattr(tup, Anum_pg_pltemplate_tmplinline, RelationGetDescr(rel), &isnull); if (!isnull) result->tmplinline = TextDatumGetCString(datum); datum = heap_getattr(tup, Anum_pg_pltemplate_tmplvalidator, RelationGetDescr(rel), &isnull); if (!isnull) result->tmplvalidator = TextDatumGetCString(datum); datum = heap_getattr(tup, Anum_pg_pltemplate_tmpllibrary, RelationGetDescr(rel), &isnull); if (!isnull) result->tmpllibrary = TextDatumGetCString(datum); /* Ignore template if handler or library info is missing */ if (!result->tmplhandler || !result->tmpllibrary) result = NULL; } else result = NULL; systable_endscan(scan); heap_close(rel, AccessShareLock); return result; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum hashbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; bool bucket_dirty = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); /* Exclusive-lock the bucket so we can shrink it */ _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); /* Scan each page in bucket */ blkno = bucket_blkno; while (BlockNumberIsValid(blkno)) { Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; vacuum_delay_point(); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); if (callback(htup, callback_state)) { /* mark the item for deletion */ deletable[ndeletable++] = offno; tuples_removed += 1; } else num_index_tuples += 1; } /* * Apply deletions and write page if needed, advance to next page. 
*/ blkno = opaque->hasho_nextblkno; if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); _hash_wrtbuf(rel, buf); bucket_dirty = true; } else _hash_relbuf(rel, buf); } /* If we deleted anything, try to compact free space */ if (bucket_dirty) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ PG_RETURN_POINTER(stats); }
/* * CompactCheckpointerRequestQueue * Remove duplicates from the request queue to avoid backend fsyncs. * Returns "true" if any entries were removed. * * Although a full fsync request queue is not common, it can lead to severe * performance problems when it does happen. So far, this situation has * only been observed to occur when the system is under heavy write load, * and especially during the "sync" phase of a checkpoint. Without this * logic, each backend begins doing an fsync for every block written, which * gets very expensive and can slow down the whole system. * * Trying to do this every time the queue is full could lose if there * aren't any removable entries. But that should be vanishingly rare in * practice: there's one queue entry per shared buffer. */ static bool CompactCheckpointerRequestQueue(void) { struct CheckpointerSlotMapping { CheckpointerRequest request; int slot; }; int n, preserve_count; int num_skipped = 0; HASHCTL ctl; HTAB *htab; bool *skip_slot; /* must hold CheckpointerCommLock in exclusive mode */ Assert(LWLockHeldByMe(CheckpointerCommLock)); /* Initialize skip_slot array */ skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); /* Initialize temporary hash table */ MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(CheckpointerRequest); ctl.entrysize = sizeof(struct CheckpointerSlotMapping); ctl.hcxt = CurrentMemoryContext; htab = hash_create("CompactCheckpointerRequestQueue", CheckpointerShmem->num_requests, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); /* * The basic idea here is that a request can be skipped if it's followed * by a later, identical request. It might seem more sensible to work * backwards from the end of the queue and check whether a request is * *preceded* by an earlier, identical request, in the hopes of doing less * copying. But that might change the semantics, if there's an * intervening FORGET_RELATION_FSYNC or FORGET_DATABASE_FSYNC request, so * we do it this way. It would be possible to be even smarter if we made * the code below understand the specific semantics of such requests (it * could blow away preceding entries that would end up being canceled * anyhow), but it's not clear that the extra complexity would buy us * anything. */ for (n = 0; n < CheckpointerShmem->num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; bool found; /* * We use the request struct directly as a hashtable key. This * assumes that any padding bytes in the structs are consistently the * same, which should be okay because we zeroed them in * CheckpointerShmemInit. Note also that RelFileNode had better * contain no pad bytes. */ request = &CheckpointerShmem->requests[n]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ skip_slot[slotmap->slot] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ slotmap->slot = n; } /* Done with the hash table. */ hash_destroy(htab); /* If no duplicates, we're out of luck. */ if (!num_skipped) { pfree(skip_slot); return false; } /* We found some duplicates; remove them. 
*/ preserve_count = 0; for (n = 0; n < CheckpointerShmem->num_requests; n++) { if (skip_slot[n]) continue; CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; } ereport(DEBUG1, (errmsg("compacted fsync request queue from %d entries to %d entries", CheckpointerShmem->num_requests, preserve_count))); CheckpointerShmem->num_requests = preserve_count; /* Cleanup. */ pfree(skip_slot); return true; }
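/*
 * Illustrative sketch (not part of the source above): a standalone model of the
 * queue compaction idea in CompactCheckpointerRequestQueue().  An entry may be
 * dropped if an identical entry appears later in the queue, which preserves the
 * ordering semantics described in the comment above; the survivors are then
 * slid down in place.  This model uses a simple O(n^2) scan instead of the hash
 * table the real code builds, and the request values are made up.
 */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	int			queue[] = {7, 3, 7, 5, 3, 7};	/* pretend fsync requests */
	int			n = (int) (sizeof(queue) / sizeof(queue[0]));
	bool		skip[sizeof(queue) / sizeof(queue[0])] = {false};
	int			preserve_count = 0;

	/* mark an entry skippable if the same request occurs again later */
	for (int i = 0; i < n; i++)
		for (int j = i + 1; j < n; j++)
			if (queue[i] == queue[j])
			{
				skip[i] = true;
				break;
			}

	/* compact the queue, keeping only the last occurrence of each request */
	for (int i = 0; i < n; i++)
		if (!skip[i])
			queue[preserve_count++] = queue[i];

	printf("compacted from %d entries to %d entries:", n, preserve_count);
	for (int i = 0; i < preserve_count; i++)
		printf(" %d", queue[i]);
	printf("\n");				/* prints: 5 3 7 */
	return 0;
}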
/* * gp_read_error_log * * Returns set of error log tuples. */ Datum gp_read_error_log(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; ReadErrorLogContext *context; HeapTuple tuple; Datum result; /* * First call setup */ if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; FILE *fp; text *relname; funcctx = SRF_FIRSTCALL_INIT(); relname = PG_GETARG_TEXT_P(0); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); context = palloc0(sizeof(ReadErrorLogContext)); funcctx->user_fctx = (void *) context; funcctx->tuple_desc = BlessTupleDesc(GetErrorTupleDesc()); /* * Though this function is usually executed on segment, we dispatch * the execution if it happens to be on QD, and combine the results * into one set. */ if (Gp_role == GP_ROLE_DISPATCH) { int resultCount = 0; PGresult **results = NULL; StringInfoData sql; StringInfoData errbuf; int i; initStringInfo(&sql); initStringInfo(&errbuf); /* * construct SQL */ appendStringInfo(&sql, "SELECT * FROM pg_catalog.gp_read_error_log(%s) ", quote_literal_internal(text_to_cstring(relname))); results = cdbdisp_dispatchRMCommand(sql.data, true, &errbuf, &resultCount); if (errbuf.len > 0) elog(ERROR, "%s", errbuf.data); Assert(resultCount > 0); for (i = 0; i < resultCount; i++) { if (PQresultStatus(results[i]) != PGRES_TUPLES_OK) elog(ERROR, "unexpected result from segment: %d", PQresultStatus(results[i])); context->numTuples += PQntuples(results[i]); } pfree(errbuf.data); pfree(sql.data); context->segResults = results; context->numSegResults = resultCount; } else { /* * In QE, read the error log. */ RangeVar *relrv; Oid relid; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); relid = RangeVarGetRelid(relrv, true); /* * If the relation has gone, silently return no tuples. */ if (OidIsValid(relid)) { AclResult aclresult; /* * Requires SELECT priv to read error log. */ aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, relrv->relname); ErrorLogFileName(context->filename, MyDatabaseId, relid); fp = AllocateFile(context->filename, "r"); context->fp = fp; } } MemoryContextSwitchTo(oldcontext); if (Gp_role != GP_ROLE_DISPATCH && !context->fp) { pfree(context); SRF_RETURN_DONE(funcctx); } } funcctx = SRF_PERCALL_SETUP(); context = (ReadErrorLogContext *) funcctx->user_fctx; /* * Read error log, probably on segments. We don't check Gp_role, however, * in case master also wants to read the file. */ if (context->fp) { pg_crc32 crc, written_crc; tuple = ErrorLogRead(context->fp, &written_crc); /* * CRC check. */ if (HeapTupleIsValid(tuple)) { INIT_CRC32C(crc); COMP_CRC32C(crc, tuple->t_data, tuple->t_len); FIN_CRC32C(crc); if (!EQ_CRC32C(crc, written_crc)) { elog(LOG, "incorrect checksum in error log %s", context->filename); tuple = NULL; } } /* * If we found a valid tuple, return it. Otherwise, fall through * in the DONE routine. */ if (HeapTupleIsValid(tuple)) { /* * We need to set typmod for the executor to understand * its type we just blessed. */ HeapTupleHeaderSetTypMod(tuple->t_data, funcctx->tuple_desc->tdtypmod); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } } /* * If we got results from dispatch, return all the tuples. 
*/ while (context->currentResult < context->numSegResults) { Datum values[NUM_ERRORTABLE_ATTR]; bool isnull[NUM_ERRORTABLE_ATTR]; PGresult *segres = context->segResults[context->currentResult]; int row = context->currentRow; if (row >= PQntuples(segres)) { context->currentRow = 0; context->currentResult++; continue; } context->currentRow++; MemSet(isnull, false, sizeof(isnull)); values[0] = ResultToDatum(segres, row, 0, timestamptz_in, &isnull[0]); values[1] = ResultToDatum(segres, row, 1, textin, &isnull[1]); values[2] = ResultToDatum(segres, row, 2, textin, &isnull[2]); values[3] = ResultToDatum(segres, row, 3, int4in, &isnull[3]); values[4] = ResultToDatum(segres, row, 4, int4in, &isnull[4]); values[5] = ResultToDatum(segres, row, 5, textin, &isnull[5]); values[6] = ResultToDatum(segres, row, 6, textin, &isnull[6]); values[7] = ResultToDatum(segres, row, 7, byteain, &isnull[7]); tuple = heap_form_tuple(funcctx->tuple_desc, values, isnull); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } if (context->segResults != NULL) { int i; for (i = 0; i < context->numSegResults; i++) PQclear(context->segResults[i]); /* XXX: better to copy to palloc'ed area */ free(context->segResults); } /* * Close the file, if we have opened it. */ if (context->fp != NULL) { FreeFile(context->fp); context->fp = NULL; } SRF_RETURN_DONE(funcctx); }
Datum ltree_in(PG_FUNCTION_ARGS) { char *buf = (char *) PG_GETARG_POINTER(0); char *ptr; nodeitem *list, *lptr; int num = 0, totallen = 0; int state = LTPRS_WAITNAME; ltree *result; ltree_level *curlevel; int charlen; int pos = 0; ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (charlen == 1 && t_iseq(ptr, '.')) num++; ptr += charlen; } if (num + 1 > MaxAllocSize / sizeof(nodeitem)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("number of levels (%d) exceeds the maximum allowed (%d)", num + 1, (int) (MaxAllocSize / sizeof(nodeitem))))); list = lptr = (nodeitem *) palloc(sizeof(nodeitem) * (num + 1)); ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (state == LTPRS_WAITNAME) { if (ISALNUM(ptr)) { lptr->start = ptr; lptr->wlen = 0; state = LTPRS_WAITDELIM; } else UNCHAR; } else if (state == LTPRS_WAITDELIM) { if (charlen == 1 && t_iseq(ptr, '.')) { lptr->len = ptr - lptr->start; if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); totallen += MAXALIGN(lptr->len + LEVEL_HDRSIZE); lptr++; state = LTPRS_WAITNAME; } else if (!ISALNUM(ptr)) UNCHAR; } else /* internal error */ elog(ERROR, "internal error in parser"); ptr += charlen; lptr->wlen++; pos++; } if (state == LTPRS_WAITDELIM) { lptr->len = ptr - lptr->start; if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); totallen += MAXALIGN(lptr->len + LEVEL_HDRSIZE); lptr++; } else if (!(state == LTPRS_WAITNAME && lptr == list)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); result = (ltree *) palloc0(LTREE_HDRSIZE + totallen); SET_VARSIZE(result, LTREE_HDRSIZE + totallen); result->numlevel = lptr - list; curlevel = LTREE_FIRST(result); lptr = list; while (lptr - list < result->numlevel) { curlevel->len = (uint16) lptr->len; memcpy(curlevel->name, lptr->start, lptr->len); curlevel = LEVEL_NEXT(curlevel); lptr++; } pfree(list); PG_RETURN_POINTER(result); }
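/*
 * Illustrative sketch (not part of contrib/ltree): a standalone model of the
 * two-state parser in ltree_in().  WAITNAME expects the first character of a
 * label, WAITDELIM consumes the rest of the label until the next '.'; an empty
 * label or a dangling trailing '.' is a syntax error, while empty input yields
 * zero levels.  This model is ASCII-only, unlike the multibyte-aware original.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

enum parse_state { WAITNAME, WAITDELIM };

int
main(void)
{
	const char *buf = "Top.Science.Astronomy";
	enum parse_state state = WAITNAME;
	int			levels = 0;

	for (const char *p = buf; *p; p++)
	{
		if (state == WAITNAME)
		{
			if (isalnum((unsigned char) *p) || *p == '_')
				state = WAITDELIM;
			else
			{
				fprintf(stderr, "syntax error at \"%s\"\n", p);
				exit(1);
			}
		}
		else if (*p == '.')
		{
			levels++;			/* finished one label */
			state = WAITNAME;
		}
		else if (!isalnum((unsigned char) *p) && *p != '_')
		{
			fprintf(stderr, "syntax error at \"%s\"\n", p);
			exit(1);
		}
	}

	if (state == WAITDELIM)
		levels++;				/* final label had no trailing '.' */
	else if (levels != 0)
	{
		fprintf(stderr, "unexpected end of line\n");	/* dangling '.' */
		exit(1);
	}

	printf("%d levels\n", levels);	/* prints: 3 levels */
	return 0;
}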
/* * Collects info about fields of a composite type. * * Based on TupleDescGetAttInMetadata. */ ProxyComposite * plproxy_composite_info(ProxyFunction *func, TupleDesc tupdesc) { int i, natts = tupdesc->natts; ProxyComposite *ret; MemoryContext old_ctx; Form_pg_attribute a; ProxyType *type; const char *name; Oid oid = tupdesc->tdtypeid; old_ctx = MemoryContextSwitchTo(func->ctx); ret = palloc(sizeof(*ret)); ret->type_list = palloc(sizeof(ProxyType *) * natts); ret->name_list = palloc0(sizeof(char *) * natts); ret->tupdesc = BlessTupleDesc(tupdesc); ret->use_binary = 1; ret->alterable = 0; if (oid != RECORDOID) { HeapTuple type_tuple; HeapTuple rel_tuple; Form_pg_type pg_type; type_tuple = SearchSysCache(TYPEOID, ObjectIdGetDatum(oid), 0, 0, 0); if (!HeapTupleIsValid(type_tuple)) elog(ERROR, "cache lookup failed for type %u", oid); pg_type = (Form_pg_type) GETSTRUCT(type_tuple); rel_tuple = SearchSysCache(RELOID, ObjectIdGetDatum(pg_type->typrelid), 0, 0, 0); if (!HeapTupleIsValid(rel_tuple)) elog(ERROR, "cache lookup failed for type relation %u", pg_type->typrelid); plproxy_set_stamp(&ret->stamp, rel_tuple); ReleaseSysCache(rel_tuple); ReleaseSysCache(type_tuple); ret->alterable = 1; if (ret->tupdesc->tdtypeid != oid) elog(ERROR, "lost oid"); } MemoryContextSwitchTo(old_ctx); ret->nfields = 0; for (i = 0; i < natts; i++) { a = TupleDescAttr(tupdesc, i); if (a->attisdropped) { ret->name_list[i] = NULL; ret->type_list[i] = NULL; continue; } ret->nfields++; name = quote_identifier(NameStr(a->attname)); ret->name_list[i] = plproxy_func_strdup(func, name); type = plproxy_find_type_info(func, a->atttypid, 0); ret->type_list[i] = type; if (!type->has_recv) ret->use_binary = 0; } return ret; }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, List *updated_stats) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; if (vacstmt->verbose) elevel = INFO; else elevel = DEBUG2; if (Gp_role == GP_ROLE_DISPATCH) elevel = DEBUG2; /* vacuum and analyze messages aren't interesting from the QD */ #ifdef FAULT_INJECTOR if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeSegmentFileDropPhase, DDLNotSpecified, "", // databaseName ""); // tableName } if (vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt)) { FaultInjector_InjectFaultIfSet( CompactionBeforeCleanupPhase, DDLNotSpecified, "", // databaseName ""); // tableName } #endif /* * MPP-23647. Update xid limits for heap as well as appendonly * relations. This allows setting relfrozenxid to correct value * for an appendonly (AO/CO) table. */ vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit); /* * Execute the various vacuum operations. Appendonly tables are treated * differently. */ if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel)) { lazy_vacuum_aorel(onerel, vacstmt, updated_stats); return; } vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); /* heap relation */ /* Set threshold for interesting free space = average request size */ /* XXX should we scale it up or down? Adjust vacuum.c too, if so */ vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node); /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats, vacstmt->extra_oids); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) lazy_truncate_heap(onerel, vacrelstats); /* Update shared free space map with final free space info */ lazy_update_fsm(onerel, vacrelstats); /* Update statistics in pg_class */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); }
/* ----------------------------------------------------------------
 *		ExecInitGatherMerge
 * ----------------------------------------------------------------
 */
GatherMergeState *
ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags)
{
	GatherMergeState *gm_state;
	Plan	   *outerNode;
	bool		hasoid;
	TupleDesc	tupDesc;

	/* Gather merge node doesn't have innerPlan node. */
	Assert(innerPlan(node) == NULL);

	/*
	 * create state structure
	 */
	gm_state = makeNode(GatherMergeState);
	gm_state->ps.plan = (Plan *) node;
	gm_state->ps.state = estate;
	gm_state->ps.ExecProcNode = ExecGatherMerge;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &gm_state->ps);

	/*
	 * initialize child expressions
	 */
	gm_state->ps.qual =
		ExecInitQual(node->plan.qual, &gm_state->ps);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &gm_state->ps);

	/*
	 * now initialize outer plan
	 */
	outerNode = outerPlan(node);
	outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags);

	/*
	 * Initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&gm_state->ps);
	ExecAssignProjectionInfo(&gm_state->ps, NULL);

	gm_state->gm_initialized = false;

	/*
	 * initialize sort-key information
	 */
	if (node->numCols)
	{
		int			i;

		gm_state->gm_nkeys = node->numCols;
		gm_state->gm_sortkeys =
			palloc0(sizeof(SortSupportData) * node->numCols);

		for (i = 0; i < node->numCols; i++)
		{
			SortSupport sortKey = gm_state->gm_sortkeys + i;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = node->collations[i];
			sortKey->ssup_nulls_first = node->nullsFirst[i];
			sortKey->ssup_attno = node->sortColIdx[i];

			/*
			 * We don't perform abbreviated key conversion here, for the same
			 * reasons that it isn't used in MergeAppend
			 */
			sortKey->abbreviate = false;

			PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
		}
	}

	/*
	 * store the tuple descriptor into gather merge state, so we can use it
	 * later while initializing the gather merge slots.
	 */
	if (!ExecContextForcesOids(&gm_state->ps, &hasoid))
		hasoid = false;
	tupDesc = ExecTypeFromTL(outerNode->targetlist, hasoid);
	gm_state->tupDesc = tupDesc;

	return gm_state;
}
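/*
 * Illustrative sketch (not part of the function above): once gm_sortkeys has
 * been filled in, merging compares two candidate tuples key by key with
 * ApplySortComparator() from utils/sortsupport.h.  This is a simplified,
 * hypothetical stand-in for the executor's own slot-comparison routine, not a
 * copy of it; the function name compare_slots_on_sortkeys is made up.
 */
static int
compare_slots_on_sortkeys(GatherMergeState *gm_state,
						  TupleTableSlot *s1, TupleTableSlot *s2)
{
	int			nkey;

	for (nkey = 0; nkey < gm_state->gm_nkeys; nkey++)
	{
		SortSupport sortKey = gm_state->gm_sortkeys + nkey;
		AttrNumber	attno = sortKey->ssup_attno;
		bool		isnull1,
					isnull2;
		Datum		datum1 = slot_getattr(s1, attno, &isnull1);
		Datum		datum2 = slot_getattr(s2, attno, &isnull2);
		int			cmp = ApplySortComparator(datum1, isnull1,
											  datum2, isnull2,
											  sortKey);

		if (cmp != 0)
			return cmp;			/* earlier keys take precedence */
	}
	return 0;					/* tuples compare equal on all sort keys */
}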