int main(int argc, char **argv) { char filename[100]; unsigned int data_size; unsigned int num_runs; /* 这轮迭代时有多少个归并段 */ unsigned int num_merges; /* 这轮迭代后产生多少个归并段 num_merges = num_runs/K */ unsigned int run_length; /* 归并段的长度,指数级增长 */ unsigned int num_runs_in_merge; /* 一般每个merge由K个runs合并而来,但最后一个merge可能少于K个runs */ int fd, rv, i, j, bytes; struct stat sbuf; if (argc != 3) { usage(); return 0; } long start_usecs = get_time_usecs(); strcpy(filename, argv[1]); fd = open(filename, O_RDONLY); if (fd < 0) { printf("can't open file %s\n", filename); exit(0); } rv = fstat(fd, &sbuf); data_size = sbuf.st_size; K = atoi(argv[2]); PAGE_SIZE = 4096; /* page = 4KB */ BUF_PAGES = 32; BUF_SIZE = PAGE_SIZE*BUF_PAGES; num_runs = data_size / PAGE_SIZE; /* 初始时的归并段数量,每个归并段有4096 byte, 即1024个整数 */ buffer = (int *)malloc(BUF_SIZE); run_length = 1; run_t **runs = (run_t **)malloc(sizeof(run_t *)*(K+1)); for (i = 0; i < K; i++) { runs[i] = (run_t *)malloc(sizeof(run_t)); runs[i]->buf = (int *)calloc(1, BUF_SIZE+4); } while (num_runs > 1) { num_merges = num_runs / K; int left_runs = num_runs % K; if(left_runs > 0) num_merges++; for (i = 0; i < num_merges; i++) { num_runs_in_merge = K; if ((i+1) == num_merges && left_runs > 0) { num_runs_in_merge = left_runs; } int base = 0; printf("Merge %d of %d,%d ways\n", i, num_merges, num_runs_in_merge); for (j = 0; j < num_runs_in_merge; j++) { if (run_length == 1) { base = 1; bytes = read(fd, runs[j]->buf, PAGE_SIZE); runs[j]->length = bytes/sizeof(int); quick_sort(runs[j]->buf, 0, runs[j]->length-1); } else { snprintf(filename, 20, "%s%d.dat", input_prefix, i*K+j); int infd = open(filename, O_RDONLY); bytes = read(infd, runs[j]->buf, BUF_SIZE); runs[j]->length = bytes/sizeof(int); close(infd); } runs[j]->idx = 0; runs[j]->offset = bytes; } k_merge(runs, input_prefix, num_runs_in_merge, base, i); } strcpy(filename, output_prefix); strcpy(output_prefix, input_prefix); strcpy(input_prefix, filename); run_length *= K; num_runs = num_merges; } for (i = 0; i < K; i++) { free(runs[i]->buf); free(runs[i]); } free(runs); free(buffer); close(fd); long end_usecs = get_time_usecs(); double secs = (double)(end_usecs - start_usecs) / (double)1000000; printf("Sorting took %.02f seconds.\n", secs); printf("sorting result saved in %s%d.dat.\n", input_prefix, 0); return 0; }
int main(int argc, char* argv[]) { if(argc != 4) { printf("Invalid number of arguments (%d)\n", argc); usage(); } DATA_FILENAME = argv[1]; BUF_SIZE = atoi(argv[2])*(1<<20); // convert from MB to bytes IO_BUF_PAGES = atoi(argv[3]); int sort_fd = open(DATA_FILENAME, O_RDONLY); if(sort_fd < 0) { error("Could not open sort_fd!"); } // Get the data file's size struct stat s; int rv = fstat(sort_fd, &s); if(rv) { printf("%s\n", DATA_FILENAME); error("Could not fstat file!"); } DATA_SIZE = s.st_size; char* filename = (char*)calloc(1, 100); // initially, each run is just 1 page, so this is how many runs there are int run_length = 1; unsigned long num_runs = DATA_SIZE / PAGE_SIZE; if(DATA_SIZE % PAGE_SIZE) { num_runs++; } printf("Sorting %lu ints (%.02f MB)...\n", num_runs*INTS_SIZE, (double)num_runs*PAGE_SIZE / (1<<20)); char input_prefix[] = "foo_"; char output_prefix[] = "bar_"; long start_usecs = get_time_usecs(); while(num_runs > 1) { printf("Iterate: %lu runs left\n", num_runs); // Number of ways we can merge at once unsigned long num_ways = ((BUF_SIZE / PAGE_SIZE) / IO_BUF_PAGES) - 1; // Case for small sorts: fits in memory if(num_ways > num_runs) { num_ways = num_runs; } // Calculate how many multimerges need to be done to merge all runs unsigned long num_merges = num_runs / num_ways; if(num_runs % num_ways) { num_merges++; } unsigned long run_counter = 0; // Start iterating the multimerges for(int i=0; i<num_merges; i++) { printf("Merge %d of %lu (%lu ways)\n", i+1, num_merges, num_ways); // Normal case: merge n-ways unsigned long num_runs_in_merge = num_ways; // Remainder case: merge num_runs % num_ways if(run_counter >= (num_runs/num_ways)*num_ways) { num_runs_in_merge = num_runs % num_ways; } // Allocate the runs and runfds we're using for this multimerge run_t** runs = (run_t**)calloc(num_runs_in_merge, sizeof(run_t*)); // Handle the first pass differently, since the numbers // are all coming out of the same file, we need to populate // the runs here instead of letting multimerge do it if(run_length == 1) { // Allocate different sized runs PRINTF("Base case: init and sorting pages\n"); for(int i=0; i<num_runs_in_merge; i++) { runs[i] = (run_t*)calloc(1, sizeof(run_t)); runs[i]->items = (int*)calloc(1, PAGE_SIZE); int bytes = read(sort_fd, runs[i]->items, PAGE_SIZE); runs[i]->length = bytes/sizeof(int); if(bytes == 0) { break; } } run_counter += num_runs_in_merge; PRINTF("Doing parallel qsort..."); parallel_qsort(runs, num_runs_in_merge); PRINTF("done!\n"); // Verify the quicksort #ifdef DEBUG PRINTF("Verifying qsort results..."); for(int i=0; i<num_runs_in_merge; i++) { int temp = 0; int init = 0; for(int j=0; j<runs[i]->length; j++) { if(!init) { temp = runs[i]->items[j]; init = 1; } if(runs[i]->items[j] < temp) { printf("Incorrect qsort! Run %d idx %d\n", i, j); exit(-1); } temp = runs[i]->items[j]; } } PRINTF("correct!\n"); #endif } else { for(int i=0; i<num_runs_in_merge; i++) { snprintf(filename, 100, "%s%lu.dat", input_prefix, run_counter); PRINTF("Reading in: %s\n", filename); run_counter++; } } snprintf(filename, 100, "%s%d.dat", output_prefix, i); PRINTF("Writing out: %s\n", filename); int output_fd = open(filename, O_CREAT|O_WRONLY|O_TRUNC, S_IRWXU|S_IRWXG); // Merge them together int base = 0; if(run_length == 1) { base = 1; } multimerge(runs, input_prefix, num_runs_in_merge, output_fd, base); close(output_fd); // Close and free runs / fds // Free the allocated runs if base case if(base == 1) { for(int i=0; i<num_runs_in_merge; i++) { free(runs[i]->items); free(runs[i]); } } free(runs); } // Swap the input and output prefixes // This way the next merge uses the previous round's output // as input strcpy(filename, output_prefix); strcpy(output_prefix, input_prefix); strcpy(input_prefix, filename); // RUN_LENGTH increases by N_WAYS every iteration run_length *= num_ways; // Number of runs produced is one per merge, i.e. num_merges num_runs = num_merges; } long end_usecs = get_time_usecs(); double secs = (double)(end_usecs - start_usecs) / (double)1000000; printf("Done sorting.\n"); printf("Sorting took %.02f seconds.\n", secs); #ifdef DEBUG snprintf(filename, 100, "%s%d.dat", input_prefix, 0); verify(filename); #endif free(filename); return 0; }