static HeapTuple
TupleParserRead(TupleParser *self, Checker *checker)
{
	uint32		len;

	BULKLOAD_PROFILE(&prof_reader_parser);

	if (QueueRead(self->queue, &len, sizeof(uint32), false) == sizeof(uint32) &&
		len > 0)
	{
		if (self->buflen < len)
		{
			self->buffer = repalloc(self->buffer, len);
			self->buflen = len;
		}
		if (QueueRead(self->queue, self->buffer, len, false) == len)
		{
			BULKLOAD_PROFILE(&prof_reader_source);
			self->tuple.t_len = len;
			self->tuple.t_data = (HeapTupleHeader) self->buffer;
			return &self->tuple;
		}
	}

	return NULL;
}
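/*
 * For illustration: the producer side of the queue presumably frames each
 * tuple as a uint32 length prefix followed by the raw HeapTupleHeader bytes,
 * which is exactly the format TupleParserRead() above consumes.  A minimal
 * sketch follows, assuming a QueueWrite() symmetric to QueueRead(); that
 * signature is an assumption for exposition, not taken from this file.
 */
#ifdef TUPLE_QUEUE_WRITER_SKETCH
static void
TupleQueueWriteSketch(Queue *queue, HeapTuple tuple)
{
	uint32		len = tuple->t_len;

	/* Frame: length prefix first, then the tuple body. */
	QueueWrite(queue, &len, sizeof(uint32), true);
	QueueWrite(queue, tuple->t_data, len, true);
}
#endif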
void
SpoolerInsert(Spooler *self, HeapTuple tuple)
{
	/* Spool the index keys in the tuple. */
	ExecStoreTuple(tuple, self->slot, InvalidBuffer, false);
	IndexSpoolInsert(self->spools, self->slot, &(tuple->t_self),
					 self->estate, true);
	BULKLOAD_PROFILE(&prof_writer_index);
}
/*
 * IndexSpoolEnd - Flush and delete spools.
 */
void
IndexSpoolEnd(Spooler *self, bool reindex)
{
	BTSpool	  **spools = self->spools;
	int			i;
	RelationPtr indices = self->relinfo->ri_IndexRelationDescs;

	Assert(spools != NULL);
	Assert(self->relinfo != NULL);

	for (i = 0; i < self->relinfo->ri_NumIndices; i++)
	{
		if (spools[i] != NULL)
		{
			_bt_mergebuild(self, spools[i]);
			_bt_spooldestroy(spools[i]);
		}
		else if (reindex)
		{
			Oid			indexOid = RelationGetRelid(indices[i]);

			/* Close the index before reindexing to pass CheckTableNotInUse(). */
			relation_close(indices[i], NoLock);
			indices[i] = NULL;
			reindex_index(indexOid, false);
			CommandCounterIncrement();
			BULKLOAD_PROFILE(&prof_reindex);
		}
		else
		{
			/* Rows were already inserted with index_insert; nothing to do. */
		}
	}

	pfree(spools);
}
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool,
			  BTReader *btspool2, Relation heapRel)
{
	BTPageState *state = NULL;
	IndexTuple	itup,
				itup2;
	bool		should_free = false;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey;
	ON_DUPLICATE on_duplicate = self->on_duplicate;

	Assert(btspool != NULL);

	/* Prepare for the merge. */
	itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
	itup2 = BTReaderGetNextItem(btspool2);
	indexScanKey = _bt_mkscankey_nodata(wstate->index);

	for (;;)
	{
		bool		load1 = true;	/* load from BTSpool next? */
		bool		hasnull;
		int32		compare;

		if (self->dup_old + self->dup_new > self->max_dup_errors)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Maximum duplicate error count exceeded")));

		if (itup2 == NULL)
		{
			if (itup == NULL)
				break;
		}
		else if (itup != NULL)
		{
			compare = compare_indextuple(itup, itup2, indexScanKey,
										 keysz, tupdes, &hasnull);

			if (compare == 0 && !hasnull && btspool->isunique)
			{
				ItemPointerData t_tid2;

				/*
				 * heap_is_visible() may update t_tid, but we still need the
				 * original value to insert into the index, so save a copy.
				 */
				ItemPointerCopy(&itup2->t_tid, &t_tid2);

				/* The tuple pointed to by the old index should not be visible. */
				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					itup = BTSpoolGetNextItem(btspool, itup, &should_free);
				}
				else if (!heap_is_visible(heapRel, &itup2->t_tid))
				{
					itup2 = BTReaderGetNextItem(btspool2);
				}
				else
				{
					if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
					{
						self->dup_old++;
						remove_duplicate(self, heapRel, itup2,
										 RelationGetRelationName(wstate->index));
						itup2 = BTReaderGetNextItem(btspool2);
					}
					else
					{
						ItemPointerCopy(&t_tid2, &itup2->t_tid);
						self->dup_new++;
						remove_duplicate(self, heapRel, itup,
										 RelationGetRelationName(wstate->index));
						itup = BTSpoolGetNextItem(btspool, itup, &should_free);
					}
				}

				continue;
			}
			else if (compare > 0)
				load1 = false;
		}
		else
			load1 = false;

		BULKLOAD_PROFILE(&prof_merge_unique);

		/* When we see the first tuple, create the first index page. */
		if (state == NULL)
			state = _bt_pagestate(wstate, 0);

		if (load1)
		{
			IndexTuple	next_itup = NULL;
			bool		next_should_free = false;

			for (;;)
			{
				/* get the next item */
				next_itup = BTSpoolGetNextItem(btspool, next_itup,
											   &next_should_free);

				if (!btspool->isunique || next_itup == NULL)
					break;

				compare = compare_indextuple(itup, next_itup, indexScanKey,
											 keysz, tupdes, &hasnull);
				if (compare < 0 || hasnull)
					break;

				if (compare > 0)
				{
					/* shouldn't happen */
					elog(ERROR, "failed in tuplesort_performsort");
				}

				/*
				 * If the tuple was deleted because of a duplicate found by
				 * another unique index, it is no longer visible.
				 */
				if (!heap_is_visible(heapRel, &next_itup->t_tid))
				{
					continue;
				}

				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					if (should_free)
						pfree(itup);

					itup = next_itup;
					should_free = next_should_free;
					next_should_free = false;
					continue;
				}

				/* not unique between input files */
				self->dup_new++;
				remove_duplicate(self, heapRel, next_itup,
								 RelationGetRelationName(wstate->index));

				if (self->dup_old + self->dup_new > self->max_dup_errors)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("Maximum duplicate error count exceeded")));
			}

			_bt_buildadd(wstate, state, itup);

			if (should_free)
				pfree(itup);

			itup = next_itup;
			should_free = next_should_free;
		}
		else
		{
			_bt_buildadd(wstate, state, itup2);
			itup2 = BTReaderGetNextItem(btspool2);
		}

		BULKLOAD_PROFILE(&prof_merge_insert);
	}
	_bt_freeskey(indexScanKey);

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index isn't temp, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a temp index we don't care
	 * since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build.
	 * It's less obvious that we have to do it even if we did WAL-log the
	 * index pages.  The reason is that since we're building outside shared
	 * buffers, a CHECKPOINT occurring during the build has no way to flush
	 * the previously written data to disk (indeed it won't know the index
	 * even exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries.  If we do not
	 * fsync those pages here, they might still not be on disk when the
	 * crash occurs.
	 */
	if (!RELATION_IS_LOCAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}

	BULKLOAD_PROFILE(&prof_merge_term);
}
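/*
 * For illustration: how _bt_mergeload() resolves a unique-key collision
 * between the new stream (itup, from the BTSpool) and the pre-existing
 * index (itup2, from the BTReader), assuming both heap tuples are visible:
 *
 *   ON_DUPLICATE_KEEP_NEW: the old row loses; dup_old++ and itup2 is passed
 *                          to remove_duplicate().
 *   otherwise (keep old):  the new row loses; dup_new++ and itup is passed
 *                          to remove_duplicate().
 *
 * Either way, loading aborts with an error as soon as dup_old + dup_new
 * exceeds max_dup_errors.
 */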
static void
_bt_mergebuild(Spooler *self, BTSpool *btspool)
{
	Relation	heapRel = self->relinfo->ri_RelationDesc;
	BTWriteState wstate;
	BTReader	reader;
	bool		merge;

	Assert(btspool->index->rd_index->indisvalid);

	tuplesort_performsort(btspool->sortstate);

	wstate.index = btspool->index;

	/*
	 * We need to log index creation in WAL iff WAL archiving is enabled AND
	 * it's not a temp index.
	 */
	wstate.btws_use_wal = self->use_wal &&
		XLogArchivingActive() && !RELATION_IS_LOCAL(wstate.index);

	/* reserve the metapage */
	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
	wstate.btws_pages_written = 0;
	wstate.btws_zeropage = NULL;	/* until needed */

	/*
	 * Flush dirty buffers so that we can read the pre-existing index data
	 * directly from its files.  We must hold AccessExclusiveLock on the
	 * target index to call FlushRelationBuffers().
	 */
	LockRelation(wstate.index, AccessExclusiveLock);
	FlushRelationBuffers(wstate.index);
	BULKLOAD_PROFILE(&prof_flush);

	merge = BTReaderInit(&reader, wstate.index);

	elog(DEBUG1, "pg_bulkload: build \"%s\" %s merge (%s wal)",
		 RelationGetRelationName(wstate.index),
		 merge ? "with" : "without",
		 wstate.btws_use_wal ? "with" : "without");

	/* Assign a new file node. */
	RelationSetNewRelfilenode(wstate.index, InvalidTransactionId);

	if (merge || (btspool->isunique && self->max_dup_errors > 0))
	{
		/* Merge the two streams into the new file node we just assigned. */
		BULKLOAD_PROFILE_PUSH();
		_bt_mergeload(self, &wstate, btspool, &reader, heapRel);
		BULKLOAD_PROFILE_POP();
		BULKLOAD_PROFILE(&prof_merge);
	}
	else
	{
		/* Fast path for a newly created index. */
		_bt_load(&wstate, btspool, NULL);
		BULKLOAD_PROFILE(&prof_index);
	}

	BTReaderTerm(&reader);
}
/**
 * @brief Read the next tuple from the parser.
 * @param rd [in/out] reader
 * @return the next tuple, or NULL at end of input
 */
HeapTuple
ReaderNext(Reader *rd)
{
	HeapTuple		tuple;
	MemoryContext	ccxt;
	bool			eof;
	Parser		   *parser = rd->parser;

	ccxt = CurrentMemoryContext;

	eof = false;
	do
	{
		tuple = NULL;
		parser->parsing_field = -1;

		PG_TRY();
		{
			tuple = ParserRead(parser, &rd->checker);
			if (tuple == NULL)
				eof = true;
			else
			{
				tuple = CheckerTuple(&rd->checker, tuple,
									 &parser->parsing_field);
				CheckerConstraints(&rd->checker, tuple,
								   &parser->parsing_field);
			}
		}
		PG_CATCH();
		{
			ErrorData	   *errdata;
			MemoryContext	ecxt;
			char		   *message;
			StringInfoData	buf;

			if (parser->parsing_field < 0)
				PG_RE_THROW();	/* must not be ignored */

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();

			/* We cannot ignore query aborts. */
			switch (errdata->sqlerrcode)
			{
				case ERRCODE_ADMIN_SHUTDOWN:
				case ERRCODE_QUERY_CANCELED:
					MemoryContextSwitchTo(ecxt);
					PG_RE_THROW();
					break;
			}

			/* Absorb parse errors. */
			rd->parse_errors++;
			if (errdata->message)
				message = pstrdup(errdata->message);
			else
				message = "<no error message>";
			FlushErrorState();
			FreeErrorData(errdata);

			initStringInfo(&buf);
			appendStringInfo(&buf, "Parse error Record " int64_FMT
							 ": Input Record " int64_FMT ": Rejected",
							 rd->parse_errors, parser->count);
			if (parser->parsing_field > 0)
				appendStringInfo(&buf, " - column %d", parser->parsing_field);
			appendStringInfo(&buf, ". %s\n", message);
			LoggerLog(WARNING, buf.data);

			/* Terminate if PARSE_ERRORS has been reached. */
			if (rd->parse_errors > rd->max_parse_errors)
			{
				eof = true;
				LoggerLog(WARNING,
						  "Maximum parse error count exceeded - " int64_FMT
						  " error(s) found in input file\n",
						  rd->parse_errors);
			}

			/* Write the rejected record to the parse bad file. */
			if (rd->parse_fp == NULL)
				if ((rd->parse_fp = AllocateFile(rd->parse_badfile, "w")) == NULL)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open parse bad file \"%s\": %m",
									rd->parse_badfile)));

			ParserDumpRecord(parser, rd->parse_fp, rd->parse_badfile);

			MemoryContextReset(ccxt);

			/*
			 * Without resetting tuple here, the regression tests produce
			 * different results on debug builds.
			 */
			tuple = NULL;
		}
		PG_END_TRY();
	} while (!eof && !tuple);

	BULKLOAD_PROFILE(&prof_reader_parser);

	return tuple;
}
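/*
 * For illustration: the error-absorption contract above hinges on
 * parser->parsing_field.  A parser sets it to the 1-based column index
 * before any operation that may ereport(), and resets it to -1 once the
 * record is fully parsed; ReaderNext() re-throws whenever it is still -1.
 * A minimal sketch of a conforming parser loop (ParseColumn() and the field
 * layout are hypothetical, for exposition only):
 */
#ifdef PARSING_FIELD_SKETCH
static void
ParseRecordSketch(Parser *parser, char **raw_fields, int nfields)
{
	int			i;

	for (i = 0; i < nfields; i++)
	{
		/* Errors thrown from here are absorbed as parse errors. */
		parser->parsing_field = i + 1;
		ParseColumn(parser, raw_fields[i], i);
	}

	/* Errors after this point abort the whole load. */
	parser->parsing_field = -1;
}
#endif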
/**
 * @brief Read one record from the input file and convert the literal string
 * to the PostgreSQL internal format.
 *
 * Process flow
 * - If the record buffer is empty
 *	 + Read up to READ_LINE_NUM records with read(2)
 *	   * Return NULL if we reach EOF.
 *	   * If an error occurs, report it to the caller with ereport().
 *	 + Count the number of records in the record buffer.
 *	 + Initialize the number of used records to 0.
 *	 + Store the head byte of the next record.
 * - If records remain in the record buffer but there is not enough room,
 *   report it to the caller with ereport().
 * - Restore the stored head byte, and store the head byte of the next record.
 * - Update the number of records used.
 * @param rd [in/out] Control information
 * @return the next tuple, or NULL at EOF
 */
static HeapTuple
BinaryParserRead(BinaryParser *self, Checker *checker)
{
	HeapTuple	tuple;
	char	   *record;
	int			i;

	/* Skip the first offset lines in the input file. */
	if (unlikely(self->need_offset > 0))
	{
		int			i;

		for (i = 0; i < self->need_offset; i++)
		{
			int			len;

			len = SourceRead(self->source, self->buffer, self->rec_len);

			if (len != self->rec_len)
			{
				if (errno == 0)
					errno = EINVAL;
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not skip " int64_FMT " lines ("
								int64_FMT " bytes) in the input file: %m",
								self->need_offset,
								self->rec_len * self->need_offset)));
			}
		}
		self->need_offset = 0;
	}

	/*
	 * If the record buffer is exhausted, read the next records from the
	 * file, up to READ_LINE_NUM rows at once.
	 */
	if (self->used_rec_cnt >= self->total_rec_cnt)
	{
		int			len;
		div_t		v;

		BULKLOAD_PROFILE(&prof_reader_parser);
		while ((len = SourceRead(self->source, self->buffer,
								 self->rec_len * READ_LINE_NUM)) < 0)
		{
			if (errno != EAGAIN && errno != EINTR)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read input file: %m")));
		}
		BULKLOAD_PROFILE(&prof_reader_source);

		/*
		 * Calculate the actual number of rows.  Trailing remainder bytes
		 * at the end of the input file are ignored with a WARNING.
		 */
		v = div(len, self->rec_len);
		if (v.rem != 0)
			elog(WARNING, "Ignore %d bytes at the end of file", v.rem);

		self->total_rec_cnt = v.quot;
		self->used_rec_cnt = 0;

		if (self->total_rec_cnt <= 0)
			return NULL;		/* eof */

		record = self->buffer;
	}
	else
	{
		record = self->buffer + (self->rec_len * self->used_rec_cnt);
	}

	/*
	 * Increment the position *before* parsing the record so that we can
	 * skip it when parsing raises an error.
	 */
	self->used_rec_cnt++;
	self->base.count++;

	for (i = 0; i < self->nfield; i++)
	{
		/* Convert character fields to the server encoding. */
		if (self->fields[i].character)
		{
			char	   *str = record + self->fields[i].offset;
			int			next_head = self->fields[i].offset +
									self->fields[i].len;

			self->next_head = record[next_head];
			record[next_head] = '\0';
			self->base.parsing_field = i + 1;	/* 1-based column number */

			self->fields[i].in = CheckerConversion(checker, str);

			record[next_head] = self->next_head;
		}
		else
		{
			self->fields[i].in = record + self->fields[i].offset;
		}
	}

	ExtractValuesFromFixed(self, record);
	self->next_head = '\0';
	self->base.parsing_field = -1;

	if (self->filter.funcstr)
		tuple = FilterTuple(&self->filter, &self->former,
							&self->base.parsing_field);
	else
		tuple = TupleFormerTuple(&self->former);

	return tuple;
}
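/*
 * For illustration: the framing above splits each bulk read into whole
 * fixed-length records plus an ignored remainder using div(3) from
 * <stdlib.h>.  A tiny self-contained sketch of the same arithmetic (the
 * helper name and example values are illustrative, not from this file):
 */
#ifdef FIXED_RECORD_FRAMING_SKETCH
#include <stdio.h>
#include <stdlib.h>

static int
count_whole_records(int bytes_read, int rec_len)
{
	div_t		v = div(bytes_read, rec_len);

	/* e.g. bytes_read = 250, rec_len = 100 => 2 records, 50 bytes dropped */
	if (v.rem != 0)
		fprintf(stderr, "ignoring %d trailing bytes\n", v.rem);

	return v.quot;
}
#endif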
/**
 * @brief Entry point of the user-defined function for pg_bulkload.
 * @return a single result row containing the load statistics.
 */
Datum
pg_bulkload(PG_FUNCTION_ARGS)
{
	Reader	   *rd = NULL;
	Writer	   *wt = NULL;
	Datum		options;
	MemoryContext ctx;
	MemoryContext ccxt;
	PGRUsage	ru0;
	PGRUsage	ru1;
	int64		count;
	int64		parse_errors;
	int64		skip;
	WriterResult ret;
	char	   *start;
	char	   *end;
	float8		system;
	float8		user;
	float8		duration;
	TupleDesc	tupdesc;
	Datum		values[PG_BULKLOAD_COLS];
	bool		nulls[PG_BULKLOAD_COLS];
	HeapTuple	result;

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	BULKLOAD_PROFILE_PUSH();

	pg_rusage_init(&ru0);

	/* must be a superuser */
	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to use pg_bulkload")));

	options = PG_GETARG_DATUM(0);

	ccxt = CurrentMemoryContext;

	/*
	 * STEP 1: Initialization
	 */

	/* parse options and create reader and writer */
	ParseOptions(options, &rd, &wt, ru0.tv.tv_sec);

	/* initialize reader */
	ReaderInit(rd);

	/*
	 * We need to split the PG_TRY block because gcc optimizes if-branches
	 * containing longjmp code too aggressively: local variables initialized
	 * in one branch cannot be used safely in the other.
	 */
	PG_TRY();
	{
		/* truncate heap */
		if (wt->truncate)
			TruncateTable(wt->relid);

		/* initialize writer */
		WriterInit(wt);

		/* initialize checker */
		CheckerInit(&rd->checker, wt->rel, wt->tchecker);

		/* initialize parser */
		ParserInit(rd->parser, &rd->checker, rd->infile, wt->desc,
				   wt->multi_process, PG_GET_COLLATION());
	}
	PG_CATCH();
	{
		if (rd)
			ReaderClose(rd, true);
		if (wt)
			WriterClose(wt, true);
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* No throwable code here! */

	PG_TRY();
	{
		/* create logger */
		CreateLogger(rd->logfile, wt->verbose, rd->infile[0] == ':');

		start = timeval_to_cstring(ru0.tv);
		LoggerLog(INFO, "\npg_bulkload %s on %s\n\n",
				  PG_BULKLOAD_VERSION, start);

		ReaderDumpParams(rd);
		WriterDumpParams(wt);
		LoggerLog(INFO, "\n");

		BULKLOAD_PROFILE(&prof_init);

		/*
		 * STEP 2: Build heap
		 */

		/* Switch into the writer's memory context. */
		Assert(wt->context);
		ctx = MemoryContextSwitchTo(wt->context);

		/* Loop for each input file record. */
		while (wt->count < rd->limit)
		{
			HeapTuple	tuple;

			CHECK_FOR_INTERRUPTS();

			/* read tuple */
			BULKLOAD_PROFILE_PUSH();
			tuple = ReaderNext(rd);
			BULKLOAD_PROFILE_POP();
			BULKLOAD_PROFILE(&prof_reader);
			if (tuple == NULL)
				break;

			/* write tuple */
			BULKLOAD_PROFILE_PUSH();
			WriterInsert(wt, tuple);
			wt->count += 1;
			BULKLOAD_PROFILE_POP();
			BULKLOAD_PROFILE(&prof_writer);

			MemoryContextReset(wt->context);
			BULKLOAD_PROFILE(&prof_reset);
		}

		MemoryContextSwitchTo(ctx);

		/*
		 * STEP 3: Finalize heap and merge indexes
		 */

		count = wt->count;
		parse_errors = rd->parse_errors;

		/*
		 * Close the writer first and the reader second, because the
		 * shmem_exit callbacks are managed as a simple stack.
		 */
		ret = WriterClose(wt, false);
		wt = NULL;
		skip = ReaderClose(rd, false);
		rd = NULL;
	}
	PG_CATCH();
	{
		ErrorData	   *errdata;
		MemoryContext	ecxt;

		ecxt = MemoryContextSwitchTo(ccxt);
		errdata = CopyErrorData();
		LoggerLog(INFO, "%s\n", errdata->message);
		FreeErrorData(errdata);

		/* close the writer first, and the reader second */
		if (wt)
			WriterClose(wt, true);
		if (rd)
			ReaderClose(rd, true);

		MemoryContextSwitchTo(ecxt);
		PG_RE_THROW();
	}
	PG_END_TRY();

	count -= ret.num_dup_new;

	LoggerLog(INFO, "\n"
			  "  " int64_FMT " Rows skipped.\n"
			  "  " int64_FMT " Rows successfully loaded.\n"
			  "  " int64_FMT " Rows not loaded due to parse errors.\n"
			  "  " int64_FMT " Rows not loaded due to duplicate errors.\n"
			  "  " int64_FMT " Rows replaced with new rows.\n\n",
			  skip, count, parse_errors, ret.num_dup_new, ret.num_dup_old);

	pg_rusage_init(&ru1);
	system = diffTime(ru1.ru.ru_stime, ru0.ru.ru_stime);
	user = diffTime(ru1.ru.ru_utime, ru0.ru.ru_utime);
	duration = diffTime(ru1.tv, ru0.tv);
	end = timeval_to_cstring(ru1.tv);

	memset(nulls, 0, sizeof(nulls));
	values[0] = Int64GetDatum(skip);
	values[1] = Int64GetDatum(count);
	values[2] = Int64GetDatum(parse_errors);
	values[3] = Int64GetDatum(ret.num_dup_new);
	values[4] = Int64GetDatum(ret.num_dup_old);
	values[5] = Float8GetDatumFast(system);
	values[6] = Float8GetDatumFast(user);
	values[7] = Float8GetDatumFast(duration);

	LoggerLog(INFO,
			  "Run began on %s\n"
			  "Run ended on %s\n\n"
			  "CPU %.2fs/%.2fu sec elapsed %.2f sec\n",
			  start, end, system, user, duration);

	LoggerClose();

	result = heap_form_tuple(tupdesc, values, nulls);

	BULKLOAD_PROFILE(&prof_fini);

	BULKLOAD_PROFILE_POP();
	BULKLOAD_PROFILE_PRINT();

	PG_RETURN_DATUM(HeapTupleGetDatum(result));
}
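/*
 * For illustration: since this is a composite-returning SQL function taking
 * an array of control options as its single argument, a load can be driven
 * entirely from SQL.  The option keywords below follow the usual
 * pg_bulkload control-file syntax, but the exact set depends on the
 * installed version, so treat this invocation as a sketch:
 *
 *   SELECT * FROM pg_bulkload(ARRAY[
 *       'TYPE=CSV',
 *       'INPUT=/tmp/foo.csv',
 *       'OUTPUT=public.foo'
 *   ]);
 *
 * The result row carries the same counters logged above: skipped rows,
 * loaded rows, parse errors, duplicate errors, and CPU/elapsed timing.
 */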
/**
 * @brief Load a heap tuple directly into the local block buffer.
 * @return void
 */
static void
DirectWriterInsert(DirectWriter *self, HeapTuple tuple)
{
	Page		page;
	OffsetNumber offnum;
	ItemId		itemId;
	Item		item;
	LoadStatus *ls = &self->ls;

	/* Compress the tuple data if needed. */
	if (tuple->t_len > TOAST_TUPLE_THRESHOLD)
		tuple = toast_insert_or_update(self->base.rel, tuple, NULL, 0);
	BULKLOAD_PROFILE(&prof_writer_toast);

	/* Assign an oid if needed. */
	if (self->base.rel->rd_rel->relhasoids)
	{
		Assert(!OidIsValid(HeapTupleGetOid(tuple)));
		HeapTupleSetOid(tuple, GetNewOid(self->base.rel));
	}

	/* Assume the tuple has been toasted already. */
	if (MAXALIGN(tuple->t_len) > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %lu, maximum size %lu",
						(unsigned long) tuple->t_len,
						(unsigned long) MaxHeapTupleSize)));

	/* Fill the current page, or advance to the next page if it is full. */
	page = GetCurrentPage(self);
	if (PageGetFreeSpace(page) < MAXALIGN(tuple->t_len) +
		RelationGetTargetPageFreeSpace(self->base.rel, HEAP_DEFAULT_FILLFACTOR))
	{
		if (self->curblk < BLOCK_BUF_NUM - 1)
			self->curblk++;
		else
		{
			flush_pages(self);
			self->curblk = 0;	/* recycle from the first block */
		}

		page = GetCurrentPage(self);

		/* Initialize the current block */
		PageInit(page, BLCKSZ, 0);
		PageSetTLI(page, ThisTimeLineID);
	}

	/* Mark the tuple as inserted by our own transaction. */
	tuple->t_data->t_infomask &= ~(HEAP_XACT_MASK);
	tuple->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
	tuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
	HeapTupleHeaderSetXmin(tuple->t_data, self->xid);
	HeapTupleHeaderSetCmin(tuple->t_data, self->cid);
	HeapTupleHeaderSetXmax(tuple->t_data, 0);

	/* Put the tuple on the local page. */
	offnum = PageAddItem(page, (Item) tuple->t_data, tuple->t_len,
						 InvalidOffsetNumber, false, true);
	ItemPointerSet(&(tuple->t_self), LS_TOTAL_CNT(ls) + self->curblk, offnum);
	itemId = PageGetItemId(page, offnum);
	item = PageGetItem(page, itemId);
	((HeapTupleHeader) item)->t_ctid = tuple->t_self;

	BULKLOAD_PROFILE(&prof_writer_table);

	SpoolerInsert(&self->spooler, tuple);
	BULKLOAD_PROFILE(&prof_writer_index);
}
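/*
 * For illustration: the page-full test above reserves the table's
 * fillfactor slack in addition to the MAXALIGN'd tuple size.  A
 * self-contained sketch of the same check (the helper name and the example
 * values in the comment are illustrative, not taken from this file):
 */
#ifdef PAGE_FIT_SKETCH
static bool
tuple_fits_on_page(Size free_space, Size tuple_len, Size fillfactor_slack)
{
	/*
	 * e.g. with free_space = 400, tuple_len = 180 (MAXALIGN'd to 184 with
	 * 8-byte alignment) and fillfactor_slack = 240: 184 + 240 = 424 > 400,
	 * so the tuple must go to the next block, as in the code above.
	 */
	return free_space >= MAXALIGN(tuple_len) + fillfactor_slack;
}
#endif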