/* * mergeFiles(temp_file_idx) * * Merge the temporary files numbered from 0 to 'temp_file_idx' * inclusive into the output file 'out_ios', maintaining sorted * order. Exits the application if an error occurs. */ static void mergeFiles( int temp_file_idx) { char errbuf[2 * PATH_MAX]; skstream_t *fps[MAX_MERGE_FILES]; uint8_t recs[MAX_MERGE_FILES][NODE_SIZE]; uint8_t lowest_rec[NODE_SIZE]; int j; uint16_t open_count; uint16_t i; uint16_t lowest; uint16_t *top_heap; int tmp_idx_a; int tmp_idx_b; skstream_t *fp_intermediate = NULL; int tmp_idx_intermediate; skheap_t *heap; uint32_t heap_count; int opened_all_temps = 0; ssize_t rv; /* the index of the first temp file to the merge */ tmp_idx_a = 0; TRACEMSG(("Merging #%d through #%d to '%s'", tmp_idx_a, temp_file_idx, skStreamGetPathname(out_rwios))); heap = skHeapCreate2(compHeapNodes, MAX_MERGE_FILES, sizeof(uint16_t), NULL, recs); if (NULL == heap) { skAppPrintOutOfMemory("heap"); appExit(EXIT_FAILURE); } /* This loop repeats as long as we haven't read all of the temp * files generated in the qsort stage. */ do { assert(SKHEAP_ERR_EMPTY==skHeapPeekTop(heap,(skheapnode_t*)&top_heap)); /* the index of the list temp file to merge */ tmp_idx_b = temp_file_idx; /* open an intermediate temp file. The merge-sort will have * to write records here if there are not enough file handles * available to open all the existing tempoary files. */ fp_intermediate = skTempFileCreateStream(tmpctx, &tmp_idx_intermediate); if (fp_intermediate == NULL) { skAppPrintSyserror("Error creating new temporary file"); appExit(EXIT_FAILURE); } /* count number of files we open */ open_count = 0; /* Attempt to open up to MAX_MERGE_FILES, though we an open * may fail due to lack of resources (EMFILE or ENOMEM) */ for (j = tmp_idx_a; j <= tmp_idx_b; ++j) { fps[open_count] = skTempFileOpenStream(tmpctx, j); if (fps[open_count] == NULL) { if ((open_count > 0) && ((errno == EMFILE) || (errno == ENOMEM))) { /* We cannot open any more files. Rewind counter * by one to catch this file on the next merge */ tmp_idx_b = j - 1; TRACEMSG((("FILE limit hit--" "merging #%d through #%d into #%d: %s"), tmp_idx_a, tmp_idx_b, tmp_idx_intermediate, strerror(errno))); break; } else { skAppPrintSyserror(("Error opening existing" " temporary file '%s'"), skTempFileGetName(tmpctx, j)); appExit(EXIT_FAILURE); } } /* read the first record */ rv = skStreamRead(fps[open_count], recs[open_count], NODE_SIZE); if (NODE_SIZE == rv) { /* insert the file index into the heap */ skHeapInsert(heap, &open_count); ++open_count; if (open_count == MAX_MERGE_FILES) { /* We've reached the limit for this pass. Set * tmp_idx_b to the file we just opened. */ tmp_idx_b = j; TRACEMSG((("MAX_MERGE_FILES limit hit--" "merging #%d through #%d to #%d"), tmp_idx_a, tmp_idx_b, tmp_idx_intermediate)); break; } } else if (0 == rv) { TRACEMSG(("Ignoring empty temporary file '%s'", skTempFileGetName(tmpctx, j))); skStreamDestroy(&fps[open_count]); } else { if (rv > 0) { snprintf(errbuf, sizeof(errbuf), "Short read %" SK_PRIdZ "/%" PRIu32 " from '%s'", rv, NODE_SIZE, skStreamGetPathname(fps[open_count])); } else { skStreamLastErrMessage( fps[open_count], rv, errbuf, sizeof(errbuf)); } skAppPrintErr( "Error reading first record from temporary file: %s", errbuf); appExit(EXIT_FAILURE); } } /* Here, we check to see if we've opened all temp files. If * so, set a flag so we write data to final destination and * break out of the loop after we're done. */ if (tmp_idx_b == temp_file_idx) { opened_all_temps = 1; /* no longer need the intermediate temp file */ skStreamDestroy(&fp_intermediate); } else { /* we could not open all temp files, so merge all opened * temp files into the intermediate file. Add the * intermediate file to the list of files to merge */ temp_file_idx = tmp_idx_intermediate; } TRACEMSG((("Merging %" PRIu16 " temporary files"), open_count)); heap_count = skHeapGetNumberEntries(heap); assert(heap_count == open_count); /* get the index of the file with the lowest record; which is * at the top of the heap */ if (skHeapPeekTop(heap, (skheapnode_t*)&top_heap) != SKHEAP_OK) { skAppPrintErr("Unable to open and read any temporary files."); appExit(EXIT_FAILURE); } lowest = *top_heap; /* exit this do...while() once all records for all opened * files have been read */ do { /* lowest_rec is the record pointed to by the index at the * top of the heap */ memcpy(lowest_rec, recs[lowest], NODE_SIZE); /* write the record */ if (fp_intermediate) { /* write the record to intermediate tmp file */ rv = skStreamWrite(fp_intermediate, lowest_rec, NODE_SIZE); if (NODE_SIZE != rv) { skAppPrintSyserror( "Error writing record to temporary file '%s'", skTempFileGetName(tmpctx, tmp_idx_intermediate)); appExit(EXIT_FAILURE); } } else { /* we successfully opened all (remaining) temp files, * write to record to the final destination */ rv = skStreamWriteRecord(out_rwios, (rwRec*)lowest_rec); if (0 != rv) { skStreamPrintLastErr(out_rwios, rv, &skAppPrintErr); if (SKSTREAM_ERROR_IS_FATAL(rv)) { appExit(EXIT_FAILURE); } } } /* replace the record we just processed and loop over all * files until we get a record that is not a duplicate */ do { if ((rv = skStreamRead(fps[lowest], recs[lowest], NODE_SIZE)) != NODE_SIZE) { /* read failed. there is no more data for this * file; remove it from the heap; if the heap is * empty, exit the loop */ skHeapExtractTop(heap, NULL); --heap_count; #if TRACEMSG_LEVEL > 0 if (rv == 0) { TRACEMSG( ("Finished reading file #%u: EOF; %u files remain", (tmp_idx_a + lowest), heap_count)); } else if (rv > 0) { TRACEMSG( ("Finished reading file #%u: Short read " "%" SK_PRIdZ "/%" PRIu32 "; %u files remain", tmp_idx_a + lowest, rv, NODE_SIZE, heap_count)); } else { skStreamLastErrMessage( fps[open_count], rv, errbuf, sizeof(errbuf)); TRACEMSG( ("Finished reading file #%u: %s; %u files remain", (tmp_idx_a + lowest), errbuf, heap_count)); } #endif /* TRACEMSG_LEVEL */ if (0 == heap_count) { break; } } else if (rwrecCompare(lowest_rec, recs[lowest])) { /* read succeeded. new record is not a * duplicate and we insert it into the heap */ /* FIXME: This comparison reduces work when the * keys are the same, but it adds another * comparison when the keys are different; is this * an overall win or lose? */ skHeapReplaceTop(heap, &lowest, NULL); } else { /* read succeeded. record is a duplicate; ignore * the record and leave the heap unchanged */ continue; } /* get the record at the top of the heap and see if it * is a duplicate; if it is, ignore it. */ skHeapPeekTop(heap, (skheapnode_t*)&top_heap); lowest = *top_heap; } while (0 == rwrecCompare(lowest_rec, recs[lowest])); } while (heap_count > 0); TRACEMSG((("Finished processing #%d through #%d"), tmp_idx_a, tmp_idx_b)); /* Close all open temp files */ for (i = 0; i < open_count; ++i) { skStreamDestroy(&fps[i]); } /* Delete all temp files we opened (or attempted to open) this * time */ for (j = tmp_idx_a; j <= tmp_idx_b; ++j) { skTempFileRemove(tmpctx, j); } /* Close the intermediate temp file. */ if (fp_intermediate) { rv = skStreamClose(fp_intermediate); if (rv) { skStreamLastErrMessage( fp_intermediate, rv, errbuf, sizeof(errbuf)); skAppPrintErr("Error closing temporary file: %s", errbuf); skStreamDestroy(&fp_intermediate); appExit(EXIT_FAILURE); } skStreamDestroy(&fp_intermediate); } /* Start the next merge with the next input temp file */ tmp_idx_a = tmp_idx_b + 1; } while (!opened_all_temps); skHeapFree(heap); }
int main(void) { #define DATA_SIZE 15 skheap_t *heap; int data[2*DATA_SIZE] = { 201, 34, 202, 56, 203, 2, 204, 65, 205, 3, 206, 5, 207, 8, 208, 74, 209, 32, 210, 78, 211, 79, 212, 80, 213, 5, 214, 5, 215, 1}; int heaps_data[2*DATA_SIZE]; int i; int j; int *iptr; uint32_t k; int* top; int top_value[2]; int status; int replace_tested = 0; int heap_init_size = 10; skheapiterator_t *iter_down; skheapiterator_t *iter_up; /* first run uses a heap where caller provides the memory */ heap = skHeapCreate(&compare, heap_init_size, 2*sizeof(int), (skheapnode_t*)heaps_data); if (NULL == heap) { printf("Cannot create heap\n"); exit(EXIT_FAILURE); } for (i = 0, iptr = data; i < DATA_SIZE; ++i, iptr += 2) { if (*iptr == 206) { continue; } printf("\n** adding %d/%d...", data[2*i], data[2*i+1]); status = skHeapInsert(heap, (skheapnode_t)iptr); if (SKHEAP_OK == status) { printf("OK\n"); } else if (SKHEAP_ERR_FULL != status) { printf("NOPE. Got wierd error status %d\n", status); } else { printf("NOPE. Heap full. Comparing with the top.\n"); skHeapPeekTop(heap, (skheapnode_t*)&top); if (0 >= compare(top, iptr)) { printf("Not added to heap since <= top (%d/%d) [%d]\n", *top, *(top+1), compare(top, iptr)); } else if (!replace_tested) { replace_tested = 1; printf("Replacing top of heap (%d/%d)...", *top, *(top+1)); if (skHeapReplaceTop(heap, (skheapnode_t)iptr, NULL) == SKHEAP_OK) { printf("OK\n"); } else { printf("Problem adding '%d/%d' to heap\n", data[2*i], data[2*i+1]); } } else { printf("Removing top of heap (%d/%d)...", *top, *(top+1)); skHeapExtractTop(heap, NULL); if (SKHEAP_OK == skHeapInsert(heap, (skheapnode_t)iptr)) { printf("OK\n"); } else { printf("Problem adding '%d/%d' to heap\n", data[2*i], data[2*i+1]); } } } printf("heap %d/%d\n", skHeapGetNumberEntries(heap), skHeapGetCapacity(heap)); for (k = 0; k < skHeapGetNumberEntries(heap); ++k) { printf("%5d %d/%d\n", k, heaps_data[2*k], heaps_data[2*k+1]); } if (i == 0) { printf("\n** Sorting the heap..."); if (SKHEAP_OK == skHeapSortEntries(heap)) { printf("OK\n"); } } } printf("\n** Sorting the heap..."); if (SKHEAP_OK == skHeapSortEntries(heap)) { printf("OK\n"); } else { printf("Got error\n"); } printf("heap %d/%d\n", skHeapGetNumberEntries(heap), skHeapGetCapacity(heap)); for (k = 0; k < skHeapGetNumberEntries(heap); ++k) { printf("%5d %d/%d\n", k, heaps_data[2*k], heaps_data[2*k+1]); } printf("\n** Iterating over the heap...\n"); iter_down = skHeapIteratorCreate(heap, 1); iter_up = skHeapIteratorCreate(heap, -1); while (skHeapIteratorNext(iter_down, (skheapnode_t*)&iptr) == SKHEAP_OK) { printf("Down: %d/%d\t\t", *iptr, *(iptr+1)); skHeapIteratorNext(iter_up, (skheapnode_t*)&iptr); printf("Up: %d/%d\n", *iptr, *(iptr+1)); } skHeapIteratorFree(iter_down); skHeapIteratorFree(iter_up); printf("\n** Removing sorted data from the heap:\n"); while (SKHEAP_OK == skHeapExtractTop(heap, (skheapnode_t)top_value)) { printf("%d/%d\n", top_value[0], top_value[1]); } skHeapFree(heap); /* second run uses a heap where heap manages its own memory */ printf("\n** Creating growable heap with initial size %d...", heap_init_size); heap = skHeapCreate2(&compare2, heap_init_size, sizeof(int), NULL, &cmpfun_data); if (NULL == heap) { printf("Cannot create heap\n"); exit(EXIT_FAILURE); } printf("OK\n"); #define REPEATS 4 for (j = 0; j < REPEATS; ++j) { printf("\n** Inserting %d entries...", (int)(sizeof(data)/sizeof(data[0]))); for (i = 0, iptr = data; i < (int)(sizeof(data)/sizeof(data[0])); ++i, ++iptr) { status = skHeapInsert(heap, (skheapnode_t)iptr); if (SKHEAP_OK == status) { /* okay */ } else if (SKHEAP_ERR_FULL != status) { printf("NOPE. Got wierd error status %d\n", status); } else { printf("NOPE. Heap full. Contains %d entries\n", skHeapGetCapacity(heap)); } } printf("OK\n"); printf("heap %d/%d\n", skHeapGetNumberEntries(heap), skHeapGetCapacity(heap)); } skHeapSortEntries(heap); printf("\n** Removing data from the heap..."); j = skHeapGetNumberEntries(heap); i = 0; while (SKHEAP_OK == skHeapExtractTop(heap, (skheapnode_t)top_value)) { /* printf("%d\n", top_value[0]); */ ++i; } printf("got %d entries\n", i); if (i != j) { printf("error extracting from heap: expected %d; got %d\n", j, i); } i = skHeapGetNumberEntries(heap); if (i != 0) { printf("error in heap contents: expected 0; got %d\n", i); } skHeapFree(heap); exit(0); }