Exemplo n.º 1
/*
 * Copy the first `get_n` keys of a word-count table, with their counts, into
 * freshly allocated arrays.
 *
 * kv      table to read; it is re-sorted while this function runs.
 * get_n   maximum number of entries wanted.
 * list    out: array of strdup()'d key strings, or NULL when nothing is returned.
 * list_n  out: number of entries stored in *list and *counts.
 * counts  out: array of integer values parallel to *list, or NULL.
 *
 * Ownership: the caller frees every string in *list, then *list and *counts.
 *
 * When nothing is requested or available (get_n <= 0 or an empty table) the
 * outputs are NULL / NULL / 0 and kv is left untouched. When an allocation
 * fails, everything allocated so far is released and the outputs are again
 * NULL / NULL / 0, so the caller never sees a half-filled result.
 */
void get_top_n_keys_from_wc(key_value *kv, int get_n, char ***list, int *list_n, int **counts)
{
	int kv_size = get_n_kv_from_kv(kv);
	int out_n = min(get_n, kv_size);
	char **keys = NULL;
	int *vals = NULL;
	int filled = 0;
	int ok = 0;

	/* Give every output a defined value before any early exit. */
	*list = NULL;
	*counts = NULL;
	*list_n = 0;

	/* Also rejects a negative get_n, which would otherwise reach malloc(). */
	if (out_n <= 0)
		return;

	/* presumably orders entries by count, largest first — confirm against sort_kv() */
	sort_kv(kv, FALSE, FALSE, value_cmp_int);

	keys = malloc((size_t)out_n * sizeof *keys);
	vals = malloc((size_t)out_n * sizeof *vals);

	if (keys != NULL && vals != NULL) {
		for (filled = 0; filled < out_n; filled++) {
			const char *k = get_key_by_index(kv, filled);

			keys[filled] = strdup(k);
			if (keys[filled] == NULL)
				break;
			vals[filled] = *(int *)get_value_by_index(kv, filled);
		}
		ok = (filled == out_n);
	}

	if (ok) {
		*list = keys;
		*counts = vals;
		*list_n = out_n;
	} else {
		/* Undo the partial copy; free(NULL) is a no-op. */
		while (filled > 0)
			free(keys[--filled]);
		free(keys);
		free(vals);
	}

	/* presumably puts kv back into its default order — confirm against sort_kv() */
	sort_kv(kv, TRUE, TRUE, NULL);
}
Exemplo n.º 2
    // Body of the mapper's server routine. The enclosing function header and
    // every brace-only line are missing from this listing, so the block
    // structure described in the notes below is read from indentation only.
    // NOTE(review): remote, hp, reducers_sk and mapper_sent_str are declared
    // but not used anywhere in the visible lines.
    struct sockaddr_in local;
    struct sockaddr_in remote;
    // NOTE(review): getsockname() takes a socklen_t * for the length; len is an int.
    int sk, len = sizeof(local), worker_sk, reducers_sk[N], rc;
    struct hostent *hp, *gethostbyname();

    // Raw request text read from the accepted connection.
    char buf[BUFSIZ];

    // Fields parsed out of the request, plus one address/port pair per reducer.
    char mapper_num[10];
    char split_name[20];
    char origin_file_name[100];
    char count_result_name[30];
    char reducer_str[N][BUFSIZ];
    char reducer_addr[N][128];
    char reducer_port[N][10];
    char mapper_sent_str[BUFSIZ];
    pthread_t threads[NUM_THREADS];
    char cmd[200];

    // NOTE(review): the results of socket(), bind(), getsockname() and
    // listen() below are never checked.
    /* Create an internet domain stream socket */
    sk = socket(AF_INET, SOCK_STREAM, 0);

    /*Construct and bind the name using default values*/
    local.sin_family = AF_INET;
    local.sin_addr.s_addr = INADDR_ANY;
    // Port 0 leaves the choice of port to the system; it is read back below.
    local.sin_port = 0;
    bind(sk, (struct sockaddr *)&local, sizeof(local));

    /*Find out and publish socket name */
    // NOTE(review): sin_port is held in network byte order; it is printed and
    // handed to operate_ns() without ntohs(), so the number shown is
    // byte-swapped on little-endian hosts.
    getsockname(sk, (struct sockaddr *)&local, &len);
    printf("Socket has port %d\n", local.sin_port);

    operate_ns("register", "mapper", "localhost", local.sin_port);

    /* Start accepting connections */
    /*Declare willingness to accept a connection*/
    listen(sk, 5);

    // One request is read per accepted connection.
    // NOTE(review): worker_sk is not closed anywhere in the visible lines.
    while (1)
        worker_sk = accept(sk, 0, 0); /*Accept new request for a connection*/

        // TODO: receive the file and store it on helper local disk
        //       Do word count and combine the result.

        // NOTE(review): the byte count returned by read() is discarded and buf
        // is never NUL-terminated here, yet it is printed with %s and scanned
        // for '\n' and '\0' below.
        read(worker_sk, buf, BUFSIZ);
        printf("%s", buf);

        // i walks buf, j is the write position in the current field,
        // k indexes the reducer tables, l numbers the output partitions.
        int i = 0, j = 0, k = 0, l = 0;
        char c;

        // First line: the mapper number, forwarded to the reducers so that
        // they know how many separate files to expect.
        // NOTE(review): none of the three field loops below checks the size of
        // its destination buffer or stops at '\0'.
        while ((c = buf[i++]) != '\n')
            mapper_num[j++] = c;
        mapper_num[j] = '\0';
        j = 0;

        // Second line: the split file name.
        while ((c = buf[i++]) != '\n')
            split_name[j++] = c;
        split_name[j] = '\0';
        j = 0;

        // Third line: the original file name.
        while ((c = buf[i++]) != '\n')
            origin_file_name[j++] = c;
        origin_file_name[j] = '\0';
        j = 0;

        // Remaining lines: one reducer description per line; fields 1 and 2 of
        // each line are extracted as the address and the port.
        // NOTE(review): nothing visible here increments k or separates the
        // per-character branch from the end-of-line branch; an else and a k++
        // appear to have been lost with the braces — confirm against the
        // original file.
        while ((c = buf[i++]) != '\0')
            if (c != '\n')
                reducer_str[k][j++] = c;
            if (j < 10) break; // stop on a line too short to be a valid address string
            reducer_str[k][j] = '\0';

            get_value_by_index(reducer_addr[k], reducer_str[k], 1);
            get_value_by_index(reducer_port[k], reducer_str[k], 2);

            j = 0;

        reducer_num = k; // publish the reducer count in a global so the threads can read it

        printf("Origin Name: %s\n", origin_file_name);
        printf("Split Name: %s\n", split_name);

        // NOTE(review): the block comment opened on the next line is never
        // closed in this listing, so a compiler would treat everything after
        // it as comment text — confirm against the original file.
        /*  Count word using command line tool
        /* 1) tr:   turn space to newline
        /* 2) tr:   turn upper case to lower case
        /* 3) sed:  get rid of char besides a-z
        /* 4) sort: sort by char
        /* 5) uniq: count unique line
        /* 6) awk:  print out to file

        sprintf(cmd, "(tr ' ' '\\n' | tr '[:upper:]' '[:lower:]' | sed -e 's/[^a-z]//g' | sort | uniq -c | awk '{print $2\" \"$1}') < ./tmp/%s > ./tmp/%s_count ", split_name, split_name);
        //printf("%s\n", cmd);
        // NOTE(review): cmd is only formatted here; no call that executes it
        // is visible in this listing.

        // TODO:
        // separate the word count result by the number of reducers (k),
        // then send each part to its reducer.
        // The alphabet is used to split the reducers' work (the shuffle step).

        // Letters per reducer: 26 / k rounded up.
        // NOTE(review): this divides by zero when k == 0.
        int step = (25 + k) / k; // round up
        j = 0, l = 0;
        char *alpha = "abcdefghijklmnopqrstuvwxyz";

        // Walk the alphabet; whenever (i + 1) is a multiple of step, or the
        // last letter is reached, hand the range alpha[j]..alpha[i] to
        // seperate_file() as partition l and start the next range at i + 1.
        // Index 0 never closes a range because of the continue.
        // NOTE(review): "./tmp/" + split_name + "_count" needs
        // strlen(split_name) + 13 bytes, but count_result_name holds 30 while
        // split_name can hold up to 19 characters.
        for (i = 0; i < 26; i++)
        	if (i == 0) continue;
            if ((i + 1) % step == 0 || (i + 1) == 26)
                sprintf(count_result_name, "./tmp/%s_count", split_name);
                seperate_file(count_result_name, alpha[j], alpha[i], l++);
                j = i + 1;

        // shuffle to reducers: fill one thread_data slot per reducer and start
        // a thread running send_work_to_reducer_and_listen on it.
        // NOTE(review): the slots receive pointers to this routine's local
        // buffers, which the next accepted request overwrites, and no
        // pthread_join() is visible; i is also not checked against NUM_THREADS.
        for (i = 0; i < k; i++)
            printf("Reducer Address: %s, Port: %s\n", reducer_addr[i], reducer_port[i]);

            thread_data_array[i].mapper_num = mapper_num;
            thread_data_array[i].split_name = split_name;
            thread_data_array[i].origin_file_name = origin_file_name;
            thread_data_array[i].reducer_addr = reducer_addr[i];
            thread_data_array[i].reducer_port = reducer_port[i];
            thread_data_array[i].i = i;
            thread_data_array[i].worker_sk = worker_sk;

	        rc = pthread_create(&threads[i], NULL, send_work_to_reducer_and_listen,
	                            (void *) &thread_data_array[i]);
	        if (rc)
                printf("ERROR; return code from pthread_create() is %d\n", rc);
            printf("thread: %d running\n", i);

        // TODO: HOW TO Communicate with Reducers?
        // SOLVED: use multiple threads to listen on the connections to those
        // reducers and a global variable to tell whether all reducers finished.

Exemplo n.º 3
void send_work_to_reducer_and_listen(void *threadarg)
	// variable for socket
    struct sockaddr_in local;
    struct sockaddr_in remote;
    int reducer_sk;
    struct hostent *hp, *gethostbyname();
    char buf[BUFSIZ];

    // variable for thread argument passing
    struct thread_data *my_data;
    my_data = (struct thread_data *) threadarg;

    char mapper_num[10];
    char split_name[20];
    char origin_file_name[100];
    char reducer_addr[128];
    char reducer_port[10];
    int i;
    int worker_sk;

    sprintf(mapper_num, "%s", my_data->mapper_num);
    sprintf(split_name, "%s", my_data->split_name);
    sprintf(origin_file_name, "%s", my_data->origin_file_name);
    sprintf(reducer_addr, "%s", my_data->reducer_addr);
    sprintf(reducer_port, "%s", my_data->reducer_port);
    i = my_data->i;
    worker_sk = my_data->worker_sk;

    char reducer_sent_str[BUFSIZ];

    reducer_sk = socket(AF_INET, SOCK_STREAM, 0);
    remote.sin_family = AF_INET;

    hp = gethostbyname(reducer_addr);
    printf("address: %s\n", reducer_addr);

    bcopy(hp->h_addr, (char *)&remote.sin_addr, hp->h_length);
    remote.sin_port = atoi(reducer_port);
    printf("port: %s\n", reducer_port);

    connect(reducer_sk, (struct sockaddr *)&remote, sizeof(remote));
    sprintf(reducer_sent_str, "%d\n%s\n%s\n%s_count_%d\n"
    	, i 				// current reducer index
    	, mapper_num		// mapper number
    	, origin_file_name	// original file name
    	, split_name		// filename: split file that reduce should work on
    	, i);				// filename: index of current split

    printf("%s\n", reducer_sent_str);

    write(reducer_sk, reducer_sent_str, strlen(reducer_sent_str) + 1);

    // listening to reducers
	char status[10];

    read(reducer_sk, buf, BUFSIZ);
    get_value_by_index(status, buf, 0);
    if (strcmp(status, "done") == 0) {
    	printf("one reducer done!\n");
    if (number_of_reducer_done >= reducer_num) {
    	printf("All reducers done!\n");
    	write(worker_sk, "status:done|type:mapper", BUFSIZ);
        number_of_reducer_done = 0;

    // TODO: Keep trace the helper and make sure they are
    //       not dead. But HOW?