/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
	PlanState  *outerNode;
	List	   *hashkeys;
	HashJoinTable hashtable;
	TupleTableSlot *slot;
	ExprContext *econtext;
	uint32		hashvalue;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStartNode(node->ps.instrument);

	/*
	 * get state info from node
	 */
	outerNode = outerPlanState(node);
	hashtable = node->hashtable;

	/*
	 * set expression context
	 */
	hashkeys = node->hashkeys;
	econtext = node->ps.ps_ExprContext;

	/*
	 * get all inner tuples and insert into the hash table (or temp files)
	 */
	for (;;)
	{
		slot = ExecProcNode(outerNode);
		if (TupIsNull(slot))
			break;
		hashtable->totalTuples += 1;
		/* We have to compute the hash value */
		econtext->ecxt_innertuple = slot;
		hashvalue = ExecHashGetHashValue(hashtable, econtext, hashkeys);
		ExecHashTableInsert(hashtable, slot, hashvalue);
	}

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStopNode(node->ps.instrument, hashtable->totalTuples);

	/*
	 * We do not return the hash table directly because it's not a subtype of
	 * Node, and so would violate the MultiExecProcNode API.  Instead, our
	 * parent Hashjoin node is expected to know how to fish it out of our node
	 * state.  Ugly but not really worth cleaning up, since Hashjoin knows
	 * quite a bit more about Hash besides that.
	 */
	return NULL;
}
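/*
 * For context, a minimal sketch of the caller's side of the contract
 * described in the closing comment above: the parent hash join stores the
 * table into the HashState before driving the Hash node, then ignores
 * MultiExecProcNode()'s NULL result and reads the node state directly.
 * Condensed and paraphrased from ExecHashJoin(); the exact
 * ExecHashTableCreate() signature varies across PostgreSQL releases.
 */
hashNode = (HashState *) innerPlanState(hjstate);
hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
								hjstate->hj_HashOperators);
hjstate->hj_HashTable = hashtable;
hashNode->hashtable = hashtable;
(void) MultiExecProcNode((PlanState *) hashNode);	/* returns NULL */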
/*
 * BufFileWrite
 *
 * Like fwrite() except we assume 1-byte element size.
 */
size_t
BufFileWrite(BufFile *file, void *ptr, size_t size)
{
	size_t		nwritten = 0;
	size_t		nthistime;

	while (size > 0)
	{
		if (file->pos >= BLCKSZ)
		{
			/* Buffer full, dump it out */
			if (file->dirty)
			{
				/* by cywang: time the buffer dump */
				InstrStartNode(file->instr_filedump);
				BufFileDumpBuffer(file);
				InstrStopNode(file->instr_filedump, 0);
				file->fileDumpNum++;

				if (file->dirty)
					break;		/* I/O error */
			}
			else
			{
				/* Hmm, went directly from reading to writing? */
				file->curOffset += file->pos;
				file->pos = 0;
				file->nbytes = 0;
			}
		}

		nthistime = BLCKSZ - file->pos;
		if (nthistime > size)
			nthistime = size;
		Assert(nthistime > 0);

		memcpy(file->buffer + file->pos, ptr, nthistime);

		file->dirty = true;
		file->pos += nthistime;
		if (file->nbytes < file->pos)
			file->nbytes = file->pos;
		ptr = (void *) ((char *) ptr + nthistime);
		size -= nthistime;
		nwritten += nthistime;
	}

	return nwritten;
}
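/*
 * Usage sketch (an illustration, not upstream code): BufFileWrite()
 * reports short writes through its return value rather than through an
 * error, so callers must compare the result against the request.  This
 * mirrors how nodeHashjoin.c handles writes to batch temp files.
 */
static void
buffile_write_all(BufFile *file, void *data, size_t len)
{
	size_t		written = BufFileWrite(file, data, len);

	if (written != len)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to hash-join temporary file: %m")));
}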
/*
 * BufFileLoadBuffer
 *
 * Load some data into buffer, if possible, starting from curOffset.
 * At call, must have dirty = false, pos and nbytes = 0.
 * On exit, nbytes is number of bytes loaded.
 */
static void
BufFileLoadBuffer(BufFile *file)
{
	File		thisfile;

	/*
	 * Advance to next component file if necessary and possible.
	 *
	 * This path can only be taken if there is more than one component, so it
	 * won't interfere with reading a non-temp file that is over
	 * MAX_PHYSICAL_FILESIZE.
	 */
	if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
		file->curFile + 1 < file->numFiles)
	{
		file->curFile++;
		file->curOffset = 0L;
	}

	/*
	 * May need to reposition physical file.
	 */
	thisfile = file->files[file->curFile];
	if (file->curOffset != file->offsets[file->curFile])
	{
		if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
			return;				/* seek failed, read nothing */
		file->offsets[file->curFile] = file->curOffset;
	}

	/* added by cywang: time the physical read */
	InstrStartNode(file->instr_fileload);

	/*
	 * Read whatever we can get, up to a full bufferload.
	 */
	file->nbytes = FileRead(thisfile, file->buffer, sizeof(file->buffer));

	InstrStopNode(file->instr_fileload, 0);
	file->fileLoadNum++;

	if (file->nbytes < 0)
		file->nbytes = 0;
	file->offsets[file->curFile] += file->nbytes;
	/* we choose not to advance curOffset here */

	pgBufferUsage.temp_blks_read++;
}
/*
 * BufFileFlush
 *
 * Like fflush()
 */
static int
BufFileFlush(BufFile *file)
{
	if (file->dirty)
	{
		/* by cywang: time the buffer dump */
		InstrStartNode(file->instr_filedump);
		BufFileDumpBuffer(file);
		InstrStopNode(file->instr_filedump, 0);
		file->fileDumpNum++;

		if (file->dirty)
			return EOF;
	}
	return 0;
}
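/*
 * A sketch (not part of the patch above) of how the cywang instrumentation
 * could be reported once a BufFile is about to be closed.  It assumes
 * instr_filedump/instr_fileload point at ordinary Instrumentation structs
 * and fileDumpNum/fileLoadNum are plain counters, which matches how the
 * code above uses them; InstrEndLoop() is the stock way to fold the
 * running counter into Instrumentation.total before reading it.
 */
static void
BufFileReportIOTime(BufFile *file)
{
	InstrEndLoop(file->instr_filedump);
	InstrEndLoop(file->instr_fileload);

	elog(LOG, "buffile: %ld dumps in %.3f ms, %ld loads in %.3f ms",
		 (long) file->fileDumpNum, 1000.0 * file->instr_filedump->total,
		 (long) file->fileLoadNum, 1000.0 * file->instr_fileload->total);
}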
/* ----------------------------------------------------------------
 *		MultiExecBitmapIndexScan(node)
 * ----------------------------------------------------------------
 */
Node *
MultiExecBitmapIndexScan(BitmapIndexScanState *node)
{
	TIDBitmap  *tbm;
	IndexScanDesc scandesc;
	double		nTuples = 0;
	bool		doscan;

	/* must provide our own instrumentation support */
	if (node->ss.ps.instrument)
		InstrStartNode(node->ss.ps.instrument);

	/*
	 * extract necessary information from index scan node
	 */
	scandesc = node->biss_ScanDesc;

	/*
	 * If we have runtime keys and they've not already been set up, do it now.
	 * Array keys are also treated as runtime keys; note that if ExecReScan
	 * returns with biss_RuntimeKeysReady still false, then there is an empty
	 * array key so we should do nothing.
	 */
	if (!node->biss_RuntimeKeysReady &&
		(node->biss_NumRuntimeKeys != 0 || node->biss_NumArrayKeys != 0))
	{
		ExecReScan((PlanState *) node);
		doscan = node->biss_RuntimeKeysReady;
	}
	else
		doscan = true;

	/*
	 * Prepare the result bitmap.  Normally we just create a new one to pass
	 * back; however, our parent node is allowed to store a pre-made one into
	 * node->biss_result, in which case we just OR our tuple IDs into the
	 * existing bitmap.  (This saves needing explicit UNION steps.)
	 */
	if (node->biss_result)
	{
		tbm = node->biss_result;
		node->biss_result = NULL;	/* reset for next time */
	}
	else
	{
		/* XXX should we use less than work_mem for this? */
		tbm = tbm_create(work_mem * 1024L);
	}

	/*
	 * Get TIDs from index and insert into bitmap
	 */
	while (doscan)
	{
		nTuples += (double) index_getbitmap(scandesc, tbm);

		CHECK_FOR_INTERRUPTS();

		doscan = ExecIndexAdvanceArrayKeys(node->biss_ArrayKeys,
										   node->biss_NumArrayKeys);
		if (doscan)				/* reset index scan */
			index_rescan(node->biss_ScanDesc,
						 node->biss_ScanKeys, node->biss_NumScanKeys,
						 NULL, 0);
	}

	/* must provide our own instrumentation support */
	if (node->ss.ps.instrument)
		InstrStopNode(node->ss.ps.instrument, nTuples);

	return (Node *) tbm;
}
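/*
 * Consumer-side sketch: the parent BitmapHeapScan receives the TIDBitmap
 * built above as MultiExecProcNode()'s return value and walks it with the
 * tbm iterator API.  Condensed from BitmapHeapNext(); prefetching and
 * lossy-page handling are omitted, and the iterator API differs slightly
 * across releases.
 */
tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
if (tbm == NULL || !IsA(tbm, TIDBitmap))
	elog(ERROR, "unrecognized result from subplan");

tbmiterator = tbm_begin_iterate(tbm);
while ((tbmres = tbm_iterate(tbmiterator)) != NULL)
{
	/* fetch heap page tbmres->blockno, recheck quals if tbmres->recheck */
}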
/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
	PlanState  *outerNode;
	List	   *hashkeys;
	HashJoinTable hashtable;
	TupleTableSlot *slot;
	ExprContext *econtext;
	uint32		hashvalue = 0;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStartNode(node->ps.instrument);

	/*
	 * get state info from node
	 */
	outerNode = outerPlanState(node);
	hashtable = node->hashtable;

	/*
	 * set expression context
	 */
	hashkeys = node->hashkeys;
	econtext = node->ps.ps_ExprContext;

#ifdef FAULT_INJECTOR
	FaultInjector_InjectFaultIfSet(
								   MultiExecHashLargeVmem,
								   DDLNotSpecified,
								   "",	// databaseName
								   "");	// tableName
#endif

	/*
	 * get all inner tuples and insert into the hash table (or temp files)
	 */
	for (;;)
	{
		slot = ExecProcNode(outerNode);
		if (TupIsNull(slot))
			break;

		Gpmon_M_Incr(GpmonPktFromHashState(node), GPMON_QEXEC_M_ROWSIN);
		CheckSendPlanStateGpmonPkt(&node->ps);

		/* We have to compute the hash value */
		econtext->ecxt_innertuple = slot;

		bool		hashkeys_null = false;

		if (ExecHashGetHashValue(node, hashtable, econtext, hashkeys,
								 false, node->hs_keepnull,
								 &hashvalue, &hashkeys_null))
		{
			ExecHashTableInsert(node, hashtable, slot, hashvalue);
		}

		if (hashkeys_null)
		{
			node->hs_hashkeys_null = true;
			if (node->hs_quit_if_hashkeys_null)
			{
				ExecSquelchNode(outerNode);
				return NULL;
			}
		}
	}

	/* Now we have set up all the initial batches & primary overflow batches. */
	hashtable->nbatch_outstart = hashtable->nbatch;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStopNode(node->ps.instrument, hashtable->totalTuples);

	/*
	 * We do not return the hash table directly because it's not a subtype of
	 * Node, and so would violate the MultiExecProcNode API.  Instead, our
	 * parent Hashjoin node is expected to know how to fish it out of our node
	 * state.  Ugly but not really worth cleaning up, since Hashjoin knows
	 * quite a bit more about Hash besides that.
	 */
	return NULL;
}
/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
	PlanState  *outerNode;
	List	   *hashkeys;
	HashJoinTable hashtable;
	TupleTableSlot *slot;
	ExprContext *econtext;
	uint32		hashvalue;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStartNode(node->ps.instrument);

	/*
	 * get state info from node
	 */
	outerNode = outerPlanState(node);
	hashtable = node->hashtable;

	/*
	 * set expression context
	 */
	hashkeys = node->hashkeys;
	econtext = node->ps.ps_ExprContext;

	/*
	 * get all inner tuples and insert into the hash table (or temp files)
	 */
	for (;;)
	{
		slot = ExecProcNode(outerNode);
		if (TupIsNull(slot))
			break;
		/* We have to compute the hash value */
		econtext->ecxt_innertuple = slot;
		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false,
								 hashtable->keepNulls, &hashvalue))
		{
			int			bucketNumber;

			bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
			if (bucketNumber != INVALID_SKEW_BUCKET_NO)
			{
				/* It's a skew tuple, so put it into that hash table */
				ExecHashSkewTableInsert(hashtable, slot, hashvalue,
										bucketNumber);
			}
			else
			{
				/* Not subject to skew optimization, so insert normally */
				ExecHashTableInsert(hashtable, slot, hashvalue);
			}
			hashtable->totalTuples += 1;
		}
	}

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStopNode(node->ps.instrument, hashtable->totalTuples);

	/*
	 * We do not return the hash table directly because it's not a subtype of
	 * Node, and so would violate the MultiExecProcNode API.  Instead, our
	 * parent Hashjoin node is expected to know how to fish it out of our node
	 * state.  Ugly but not really worth cleaning up, since Hashjoin knows
	 * quite a bit more about Hash besides that.
	 */
	return NULL;
}
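/*
 * Probe-side counterpart of the skew branch above, condensed from
 * ExecHashJoin(): each outer tuple's hash value is checked against the
 * same skew hash table, so tuples matching the inner side's most common
 * values are joined entirely in memory and never written to batch files.
 */
node->hj_CurHashValue = hashvalue;
ExecHashGetBucketAndBatch(hashtable, hashvalue,
						  &node->hj_CurBucketNo, &batchno);
node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, hashvalue);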
/* ----------------------------------------------------------------
 *		MultiExecBitmapIndexScan(node)
 * ----------------------------------------------------------------
 */
Node *
MultiExecBitmapIndexScan(BitmapIndexScanState *node)
{
	IndexScanState *scanState = (IndexScanState *) node;
	Node	   *bitmap = NULL;

	/* must provide our own instrumentation support */
	if (scanState->ss.ps.instrument)
	{
		InstrStartNode(scanState->ss.ps.instrument);
	}

	bool		partitionIsReady = DynamicScan_BeginIndexPartition(scanState,
											false /* initQual */,
											false /* initTargetList */,
											true /* supportsArrayKeys */,
											true /* isMultiScan */);

	Assert(partitionIsReady);
	if (!partitionIsReady)
	{
		DynamicScan_EndIndexPartition(scanState);
		return NULL;
	}

	bool		doscan = node->indexScanState.iss_RuntimeKeysReady;
	IndexScanDesc scandesc = scanState->iss_ScanDesc;

	/* Get bitmap from index */
	while (doscan)
	{
		bitmap = index_getmulti(scandesc, node->bitmap);

		if ((NULL != bitmap) &&
			!(IsA(bitmap, HashBitmap) || IsA(bitmap, StreamBitmap)))
		{
			elog(ERROR, "unrecognized result from bitmap index scan");
		}

		CHECK_FOR_INTERRUPTS();

		/* CDB: If EXPLAIN ANALYZE, let bitmap share our Instrumentation. */
		if (scanState->ss.ps.instrument)
		{
			tbm_bitmap_set_instrument(bitmap, scanState->ss.ps.instrument);
		}

		if (node->bitmap == NULL)
		{
			node->bitmap = (Node *) bitmap;
		}

		doscan = ExecIndexAdvanceArrayKeys(scanState->iss_ArrayKeys,
										   scanState->iss_NumArrayKeys);
		if (doscan)
		{
			/* reset index scan */
			index_rescan(scanState->iss_ScanDesc, scanState->iss_ScanKeys);
		}
	}

	DynamicScan_EndIndexPartition(scanState);

	/* must provide our own instrumentation support */
	if (scanState->ss.ps.instrument)
	{
		InstrStopNode(scanState->ss.ps.instrument, 1 /* nTuples */);
	}

	return (Node *) bitmap;
}
/*
 * RouterExecutorRun actually executes a single task on a worker.
 */
void
RouterExecutorRun(QueryDesc *queryDesc, ScanDirection direction, long count,
				  Task *task)
{
	EState	   *estate = queryDesc->estate;
	CmdType		operation = queryDesc->operation;
	MemoryContext oldcontext = NULL;
	DestReceiver *destination = queryDesc->dest;
	MaterialState *routerState = (MaterialState *) queryDesc->planstate;
	bool		sendTuples = operation == CMD_SELECT ||
							 queryDesc->plannedstmt->hasReturning;

	Assert(estate != NULL);
	Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY));
	Assert(task != NULL);

	/* we only support default scan direction and row fetch count */
	if (!ScanDirectionIsForward(direction))
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("scan directions other than forward scans "
							   "are unsupported")));
	}

	oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);

	if (queryDesc->totaltime != NULL)
	{
		InstrStartNode(queryDesc->totaltime);
	}

	estate->es_processed = 0;

	/* startup the tuple receiver */
	if (sendTuples)
	{
		(*destination->rStartup)(destination, operation, queryDesc->tupDesc);
	}

	/*
	 * If query has not yet been executed, do so now. The main reason why the
	 * query might already have been executed is cursors.
	 */
	if (!routerState->eof_underlying)
	{
		bool		resultsOK = false;
		bool		isModificationQuery = false;

		if (operation == CMD_INSERT || operation == CMD_UPDATE ||
			operation == CMD_DELETE)
		{
			isModificationQuery = true;
		}
		else if (operation != CMD_SELECT)
		{
			ereport(ERROR, (errmsg("unrecognized operation code: %d",
								   (int) operation)));
		}

		resultsOK = ExecuteTaskAndStoreResults(queryDesc, task,
											   isModificationQuery,
											   sendTuples);
		if (!resultsOK)
		{
			ereport(ERROR, (errmsg("could not receive query results")));
		}

		/* mark underlying query as having executed */
		routerState->eof_underlying = true;
	}

	/* if the underlying query produced output, return it */
	if (routerState->tuplestorestate != NULL)
	{
		TupleDesc	resultTupleDescriptor = queryDesc->tupDesc;
		int64		returnedRows = 0;

		/* return rows from the tuplestore */
		returnedRows = ReturnRowsFromTuplestore(count, resultTupleDescriptor,
												destination,
												routerState->tuplestorestate);

		/*
		 * Count tuples processed, if this is a SELECT.  (For modifications
		 * it'll already have been increased, as we want the number of
		 * modified tuples, not the number of RETURNed tuples.)
		 */
		if (operation == CMD_SELECT)
		{
			estate->es_processed += returnedRows;
		}
	}

	/* shutdown tuple receiver, if we started it */
	if (sendTuples)
	{
		(*destination->rShutdown)(destination);
	}

	if (queryDesc->totaltime != NULL)
	{
		InstrStopNode(queryDesc->totaltime, estate->es_processed);
	}

	MemoryContextSwitchTo(oldcontext);
}