예제 #1
0
파일: outsort.c 프로젝트: autisyu/mygit
/*
 * External K-way merge sort driver.
 *
 * Usage: <program> <data-file> <K>
 *
 * The data file is read one PAGE_SIZE page at a time; every page is sorted
 * in memory with quick_sort() and serves as an initial run.  Each pass then
 * hands groups of up to K runs to k_merge() until at most one run remains.
 * On later passes the runs are loaded from "<input_prefix><n>.dat" files.
 * input_prefix and output_prefix are swapped after every pass (presumably so
 * the next pass reads what the previous one produced -- k_merge() is not
 * visible here).
 *
 * Returns 0 on success (and after printing the usage text), EXIT_FAILURE if
 * the input cannot be opened/read, K is invalid, or memory runs out.
 *
 * NOTE(review): num_runs is data_size / PAGE_SIZE rounded DOWN, so a trailing
 * partial page is never read, and an input shorter than two pages performs no
 * merge pass at all.  Left unchanged because k_merge()'s handling of short
 * runs cannot be confirmed from this file.
 */
int main(int argc, char **argv)
{
	char			filename[100];			/* scratch buffer: run file names and the prefix swap */
	unsigned int	data_size;
	unsigned int	num_runs;				/* number of runs at the start of the current pass */
	unsigned int	num_merges;				/* number of runs this pass produces: ceil(num_runs / K) */
	unsigned int	run_length;				/* run length, multiplied by K per pass; 1 marks the first pass */
	unsigned int	num_runs_in_merge;		/* normally K runs per merge; the last merge of a pass may have fewer */
	int				fd, rv, i, j, bytes;
	int				status = EXIT_FAILURE;
	struct stat		sbuf;
	run_t			**runs = NULL;

	if (argc != 3) {
		usage();
		return 0;
	}
	long start_usecs = get_time_usecs();

	/* Open argv[1] directly: copying it into the fixed 100-byte scratch
	 * buffer could overflow on a long path. */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		printf("can't open file %s\n", argv[1]);
		return EXIT_FAILURE;
	}
	rv = fstat(fd, &sbuf);
	if (rv != 0) {
		perror("fstat");
		close(fd);
		return EXIT_FAILURE;
	}
	data_size = sbuf.st_size;

	K = atoi(argv[2]);
	/* K <= 0 would divide by zero below; K == 1 never reduces the number of
	 * runs, so the pass loop would never terminate. */
	if (K < 2) {
		fprintf(stderr, "K must be at least 2 (got %s)\n", argv[2]);
		close(fd);
		return EXIT_FAILURE;
	}
	PAGE_SIZE = 4096;							/* page = 4KB */
	BUF_PAGES = 32;
	BUF_SIZE = PAGE_SIZE*BUF_PAGES;
	num_runs = data_size / PAGE_SIZE;			/* initial number of runs; each run is 4096 bytes, i.e. 1024 ints */
	buffer = (int *)malloc(BUF_SIZE);
	if (buffer == NULL) {
		perror("malloc");
		close(fd);
		return EXIT_FAILURE;
	}

	run_length = 1;
	/* Zero-filled so the cleanup code can tell which slots were allocated. */
	runs = (run_t **)calloc(K+1, sizeof(run_t *));
	if (runs == NULL) {
		perror("calloc");
		goto cleanup;
	}
	for (i = 0; i < K; i++) {
		runs[i] = (run_t *)calloc(1, sizeof(run_t));
		if (runs[i] == NULL) {
			perror("calloc");
			goto cleanup;
		}
		runs[i]->buf = (int *)calloc(1, BUF_SIZE+4);
		if (runs[i]->buf == NULL) {
			perror("calloc");
			goto cleanup;
		}
	}

	while (num_runs > 1) {
		num_merges = num_runs / K;
		int left_runs = num_runs % K;
		if (left_runs > 0) {
			num_merges++;
		}
		for (i = 0; (unsigned int)i < num_merges; i++) {
			num_runs_in_merge = K;
			/* The last merge of the pass takes whatever runs are left over. */
			if ((unsigned int)(i+1) == num_merges && left_runs > 0) {
				num_runs_in_merge = left_runs;
			}
			int base = 0;
			printf("Merge %d of %u,%u ways\n", i, num_merges, num_runs_in_merge);
			for (j = 0; (unsigned int)j < num_runs_in_merge; j++) {
				if (run_length == 1) {
					/* First pass: pull the next page from the data file
					 * and sort it in memory. */
					base = 1;
					bytes = read(fd, runs[j]->buf, PAGE_SIZE);
					if (bytes < 0) {
						perror("read");
						goto cleanup;
					}
					runs[j]->length = bytes/sizeof(int);
					quick_sort(runs[j]->buf, 0, runs[j]->length-1);
				} else {
					/* Later passes: load the head of run file number i*K+j.
					 * The 20-byte name limit is kept as in the original so
					 * the names stay in step with whatever k_merge() uses
					 * (not visible here). */
					snprintf(filename, 20, "%s%d.dat", input_prefix, i*K+j);
					int infd = open(filename, O_RDONLY);
					if (infd < 0) {
						perror(filename);
						goto cleanup;
					}
					bytes = read(infd, runs[j]->buf, BUF_SIZE);
					close(infd);
					if (bytes < 0) {
						perror("read");
						goto cleanup;
					}
					runs[j]->length = bytes/sizeof(int);
				}
				runs[j]->idx = 0;
				runs[j]->offset = bytes;
			}
			k_merge(runs, input_prefix, num_runs_in_merge, base, i);
		}

		/* Swap input_prefix and output_prefix via the scratch buffer. */
		strcpy(filename, output_prefix);
		strcpy(output_prefix, input_prefix);
		strcpy(input_prefix, filename);

		run_length *= K;
		num_runs = num_merges;
	}
	status = EXIT_SUCCESS;

cleanup:
	if (runs != NULL) {
		for (i = 0; i < K; i++) {
			if (runs[i] != NULL) {
				free(runs[i]->buf);
				free(runs[i]);
			}
		}
		free(runs);
	}
	free(buffer);
	close(fd);

	if (status == EXIT_SUCCESS) {
		long end_usecs = get_time_usecs();
		double secs = (double)(end_usecs - start_usecs) / (double)1000000;
		printf("Sorting took %.02f seconds.\n", secs);
		printf("sorting result saved in %s%d.dat.\n", input_prefix, 0);
	}

	return status;
}
예제 #2
0
int main(int argc, char* argv[])
{
    if(argc != 4) {
        printf("Invalid number of arguments (%d)\n", argc);
        usage();
    }
    DATA_FILENAME = argv[1];
    BUF_SIZE = atoi(argv[2])*(1<<20); // convert from MB to bytes
    IO_BUF_PAGES = atoi(argv[3]);

    int sort_fd = open(DATA_FILENAME, O_RDONLY);
    if(sort_fd < 0) {
        error("Could not open sort_fd!");
    }

    // Get the data file's size
    struct stat s;
    int rv = fstat(sort_fd, &s);
    if(rv) {
        printf("%s\n", DATA_FILENAME);
        error("Could not fstat file!");
    }
    DATA_SIZE = s.st_size;

    char* filename = (char*)calloc(1, 100);

    // initially, each run is just 1 page, so this is how many runs there are
    int run_length = 1;
    unsigned long num_runs = DATA_SIZE / PAGE_SIZE;
    if(DATA_SIZE % PAGE_SIZE) {
        num_runs++;
    }

    printf("Sorting %lu ints (%.02f MB)...\n", 
            num_runs*INTS_SIZE,
            (double)num_runs*PAGE_SIZE / (1<<20));

    char input_prefix[] = "foo_";
    char output_prefix[] = "bar_";

    long start_usecs = get_time_usecs();

    while(num_runs > 1) {
        printf("Iterate: %lu runs left\n", num_runs);
        // Number of ways we can merge at once
        unsigned long num_ways = ((BUF_SIZE / PAGE_SIZE) / IO_BUF_PAGES) - 1;
        // Case for small sorts: fits in memory
        if(num_ways > num_runs) {
            num_ways = num_runs;
        }
        // Calculate how many multimerges need to be done to merge all runs
        unsigned long num_merges = num_runs / num_ways;
        if(num_runs % num_ways) {
            num_merges++;
        }

        unsigned long run_counter = 0;

        // Start iterating the multimerges
        for(int i=0; i<num_merges; i++) {
            printf("Merge %d of %lu (%lu ways)\n", i+1, num_merges, num_ways);
            // Normal case: merge n-ways
            unsigned long num_runs_in_merge = num_ways;
            // Remainder case: merge num_runs % num_ways
            if(run_counter >= (num_runs/num_ways)*num_ways) {
                num_runs_in_merge = num_runs % num_ways;
            }

            // Allocate the runs and runfds we're using for this multimerge
            run_t** runs = (run_t**)calloc(num_runs_in_merge, sizeof(run_t*));

            // Handle the first pass differently, since the numbers
            // are all coming out of the same file, we need to populate
            // the runs here instead of letting multimerge do it
            if(run_length == 1) {
                // Allocate different sized runs
                PRINTF("Base case: init and sorting pages\n");
                for(int i=0; i<num_runs_in_merge; i++) {
                    runs[i] = (run_t*)calloc(1, sizeof(run_t));
                    runs[i]->items = (int*)calloc(1, PAGE_SIZE);
                    int bytes = read(sort_fd, runs[i]->items, PAGE_SIZE);
                    runs[i]->length = bytes/sizeof(int);
                    if(bytes == 0) {
                        break;
                    }
                }
                run_counter += num_runs_in_merge;

                PRINTF("Doing parallel qsort...");
                parallel_qsort(runs, num_runs_in_merge);
                PRINTF("done!\n");
                // Verify the quicksort
#ifdef DEBUG
                PRINTF("Verifying qsort results...");
                for(int i=0; i<num_runs_in_merge; i++) {
                    int temp = 0;
                    int init = 0;
                    for(int j=0; j<runs[i]->length; j++) {
                        if(!init) {
                            temp = runs[i]->items[j];
                            init = 1;
                        }

                        if(runs[i]->items[j] < temp) {
                            printf("Incorrect qsort! Run %d idx %d\n",
                                    i, j);
                            exit(-1);
                        }
                        temp = runs[i]->items[j];
                    }
                }
                PRINTF("correct!\n");
#endif
            }
            else {
                for(int i=0; i<num_runs_in_merge; i++) {
                    snprintf(filename, 100, "%s%lu.dat", input_prefix, run_counter);
                    PRINTF("Reading in: %s\n", filename);
                    run_counter++;
                }
            }

            snprintf(filename, 100, "%s%d.dat", output_prefix, i);
            PRINTF("Writing out: %s\n", filename);
            int output_fd = open(filename, O_CREAT|O_WRONLY|O_TRUNC, 
                    S_IRWXU|S_IRWXG);
            // Merge them together
            int base = 0;
            if(run_length == 1) {
                base = 1;
            }
            multimerge(runs, input_prefix, num_runs_in_merge, output_fd, base);

            close(output_fd);

            // Close and free runs / fds
            // Free the allocated runs if base case
            if(base == 1) {
                for(int i=0; i<num_runs_in_merge; i++) {
                    free(runs[i]->items);
                    free(runs[i]);
                }
            }
            free(runs);

        }
        // Swap the input and output prefixes
        // This way the next merge uses the previous round's output
        // as input
        strcpy(filename, output_prefix);
        strcpy(output_prefix, input_prefix);
        strcpy(input_prefix, filename);

        // RUN_LENGTH increases by N_WAYS every iteration
        run_length *= num_ways;
        // Number of runs produced is one per merge, i.e. num_merges
        num_runs = num_merges;
    }

    long end_usecs = get_time_usecs();
    double secs = (double)(end_usecs - start_usecs) / (double)1000000;
    printf("Done sorting.\n");
    printf("Sorting took %.02f seconds.\n", secs);
#ifdef DEBUG
    snprintf(filename, 100, "%s%d.dat", input_prefix, 0);
    verify(filename);
#endif

    free(filename);

    return 0;
}