/*
 * Exercises count_features() on an instructions_capabilities structure in
 * four configurations: freshly xcalloc'ed, every byte set to 1, only
 * meter/goto_table enabled, and three other members enabled.
 */
static void
test_count_features( void **state ) {
  UNUSED( state );

  instructions_capabilities *caps = ( instructions_capabilities * ) xcalloc( 1, sizeof( *caps ) );
  const size_t caps_size = sizeof( *caps );

  // Freshly xcalloc'ed structure: the expected count is 0.
  const uint16_t none_enabled = count_features( ( void * ) caps, caps_size );
  assert_int_equal( none_enabled, 0 );

  // Every byte of the structure set to 1: the expected count is 6.
  memset( caps, 1, caps_size );
  const uint16_t all_enabled = count_features( ( void * ) caps, caps_size );
  assert_int_equal( all_enabled, 6 );

  // Only the first and the last attribute enabled.
  memset( caps, 0, caps_size );
  caps->goto_table = true;
  caps->meter = true;
  const uint16_t edges_enabled = count_features( ( void * ) caps, caps_size );
  assert_int_equal( edges_enabled, 2 );

  // Three attributes located around the middle of the structure.
  memset( caps, 0, caps_size );
  caps->write_metadata = true;
  caps->write_actions = true;
  caps->clear_actions = true;
  const uint16_t middle_enabled = count_features( ( void * ) caps, caps_size );
  assert_int_equal( middle_enabled, 3 );

  xfree( caps );
}
Beispiel #2
0
//Returns the number of nodes in the feature tree rooted at root.
//An empty tree (root == NULL) contains 0 features.
int count_features(feature_tree *root){
	if (!root){
		return 0;
	}

	//this node plus everything stored in its two subtrees
	int in_left = count_features(root->left);
	int in_right = count_features(root->right);
	return 1 + in_left + in_right;
}
/*
 * Checks assign_action_ids() against a capability set in which every byte
 * is 1 except that drop is switched off: each written header is validated
 * with assert_action_ids() and the returned length must equal the number of
 * counted features times sizeof( struct ofp_action_header ).
 */
static void
test_assign_action_ids( void **state ) {
  UNUSED( state );

  actions_capabilities *caps = ( actions_capabilities * ) xmalloc( sizeof( *caps ) );
  memset( caps, 1, sizeof( *caps ) );
  caps->drop = false;

  // One header slot per feature reported by count_features().
  const uint16_t n_actions = count_features( ( void * ) caps, sizeof( *caps ) );
  struct ofp_action_header *headers = ( struct ofp_action_header * ) xmalloc( n_actions * sizeof( *headers ) );
  const uint16_t written_len = assign_action_ids( headers, caps );

  uint16_t idx = 0;
  while ( idx < n_actions ) {
    assert_action_ids( headers + idx, caps );
    idx++;
  }
  assert_int_equal( written_len, n_actions * sizeof( struct ofp_action_header ) );

  xfree( headers );
  xfree( caps );
}
/*
 * Checks assign_instruction_ids() against a capability set in which every
 * byte is 1: each written entry is validated with assert_instruction() and
 * the returned length must equal the number of counted features times
 * sizeof( struct ofp_instruction ).
 */
static void
test_assign_instruction_ids( void **state ) {
  UNUSED( state );

  // Enable all the instructions.
  instructions_capabilities *caps = ( instructions_capabilities * ) xmalloc( sizeof( *caps ) );
  memset( caps, 1, sizeof( *caps ) );

  // One ofp_instruction slot per feature reported by count_features().
  const uint16_t n_instructions = count_features( ( void * ) caps, sizeof( *caps ) );
  struct ofp_instruction *slots = ( struct ofp_instruction * ) xmalloc( n_instructions * sizeof( *slots ) );
  const uint16_t written_len = assign_instruction_ids( slots, caps );

  uint16_t idx = 0;
  while ( idx < n_instructions ) {
    assert_instruction( slots + idx, caps );
    idx++;
  }
  assert_int_equal( written_len, n_instructions * sizeof( struct ofp_instruction ) );

  xfree( slots );
  xfree( caps );
}
Beispiel #5
0
/*
 * Entry point of the k-nearest-neighbour classifier.
 *
 * Steps, in order:
 *   1. Parse the command line with argp; args[0] is read as k and args[1] as
 *      the OpenMP thread count (defaults 1 and 4).
 *   2. Read total_examples lines from "train_pruned2.csv". The first line is
 *      skipped; every other line is split on '|' into an example number, a
 *      question and a category. Question words longer than three characters
 *      are stored and inserted into the vocabulary tree.
 *   3. Number the vocabulary features and convert every successfully parsed
 *      example into its numeric representation.
 *   4. In an OpenMP parallel loop, classify every example whose example
 *      number is a multiple of `train` from its k nearest neighbours and
 *      count the correct answers.
 *   5. Print per-thread and overall results and release all memory.
 *
 * Returns 0 on success and -1 if the csv file cannot be opened; the process
 * exits with EXIT_FAILURE if fewer than total_examples lines can be read.
 *
 * NOTE(review): the results of malloc/calloc are not checked anywhere in
 * this function.
 */
int main (int argc, char **argv)
{
    struct arguments arguments;

    /* Parse our arguments; every option seen by parse_opt will
       be reflected in arguments. */
    argp_parse (&argp, argc, argv, 0, 0, &arguments);

    //number of nearest neighbors; default is 1, also used when the argument
    //does not parse or is not positive (k sizes the per-thread arrays below)
    int k = 1;
    if (sscanf (arguments.args[0], "%i", &k)!=1 || k < 1){
        k = 1;
    }

    //omp thread count; default is 4, also used for unparsable or
    //non-positive input
    int num_threads = 4;
    if (sscanf (arguments.args[1], "%i", &num_threads)!=1 || num_threads < 1){
        num_threads = 4;
    }

    //verbose? normalised to 0/1
    int verbose = arguments.verbose;
    if (verbose>0 && verbose<130){
        verbose = 1;
    }
    else{
        verbose = 0;
    }

    //counters (kk and ii are listed in the private clause of the parallel region)
    int i, j, ii, jj, kk;

    //number of csv lines to read in (including the skipped first line);
    //also the number of slots allocated in all_data and num_data
    const int total_examples = 10000;

    //max words per question
    const int num_words = 300;

    //max word length (including the terminating NUL)
    const int max_word_len = 20;

    //number of slots in the category list
    const int max_categories = 40;

    //size of the buffer handed to fgets
    const int csv_line_len = 1500;

    //rows that could not be split into three '|' separated fields
    int bad_iter = 0;

    //Used to split into training and testing data
    //(examples with example_num%train == 0 are the test data)
    int train = 10;

    //Debug
    int debug = 0;

    printf("k, Verbose, num_threads = %i, %i, %i\n",
                          k, verbose, num_threads);

    //Open the csv file before allocating anything, so that the early return
    //on failure has nothing to release
    FILE *f = fopen("train_pruned2.csv", "r");
    if (f == NULL){
        printf("Failed to open file \n");
        return -1;
    }

    //Allocate space for data being read in with fgets
    char *csv_line = malloc(sizeof(char)*csv_line_len);

    //store all data
    //array of structs
    //struct.question->array of char*
    //struct.cat->char*
    //struct.example_num->int
    struct data *all_data;
    all_data = malloc(sizeof(struct data)*total_examples);
    for (ii=0; ii<total_examples; ii++){
        all_data[ii].question = malloc(sizeof(char*)*num_words);
        for (jj=0; jj<num_words; jj++){
            all_data[ii].question[jj] = calloc(max_word_len, sizeof(char));
        }
        all_data[ii].cat = malloc(sizeof(char)*max_word_len);
    }

    //store numeric version of data for algorithms
    struct numeric_data *num_data;
    num_data = malloc(sizeof(struct numeric_data)*total_examples);
    for (ii=0; ii<total_examples; ii++){
        num_data[ii].array_of_features = malloc(sizeof(struct feature_count)*num_words);
        for (jj=0; jj<num_words; jj++){
            num_data[ii].array_of_features[jj].feature_num = 0;
            num_data[ii].array_of_features[jj].count = 0;
        }
    }

    //vocabulary, stored as a tree
    feature_tree *vocab = NULL;

    //store category list; every slot starts out as the empty string
    char **cat_list;
    cat_list = malloc(sizeof(char*)*max_categories);
    for (ii=0; ii<max_categories; ii++){
        cat_list[ii] = malloc(sizeof(char)*max_word_len);
        cat_list[ii][0] = '\0';
    }

    //parse question into individual words, create vocabulary
    int category_count = 1;

    for (i=0; i<total_examples; i++){

        //line in csv to buffer
        if (fgets(csv_line, csv_line_len, f) == NULL){
            printf("Fgets error!\n");
            //running out of input is an error, so do not report success
            exit(EXIT_FAILURE);
        }

        //the first line is consumed but not parsed
        if (i == 0){
            continue;
        }

        //Slot for this row. Rejected rows do not advance the slot, so the
        //accepted rows end up packed at the front of all_data.
        int row = i-bad_iter-1;

        //csv line to 3 individual parts
        char *tok = strtok(csv_line, "|");
        if (tok == NULL){
            bad_iter++;
            continue;
        }
        if (sscanf(tok, "%i", &all_data[row].example_num) != 1){
            //Do not leave the field uninitialised: -1 is neither a multiple
            //of train nor > 0, so the row is skipped by both checks in the
            //classification loop below.
            all_data[row].example_num = -1;
        }

        tok = strtok(NULL, "|");
        if (tok == NULL){
            bad_iter++;
            continue;
        }
        //remember the question field; it is split into words only after the
        //category field has been extracted, because strtok keeps a single
        //internal position
        char *question_field = tok;

        tok = strtok(NULL, "|");
        if (tok == NULL){
            bad_iter++;
            continue;
        }
        strncpy(all_data[row].cat, tok, max_word_len-1);
        all_data[row].cat[max_word_len-1] = 0;

        //split the question on blanks/tabs; only words longer than three
        //characters are kept, and at most num_words of them
        j = 0;
        char *word = strtok(question_field, " \t");
        while (word != NULL){
            if (strlen(word)>3){
                strncpy(all_data[row].question[j], word, max_word_len-1);
                all_data[row].question[j][max_word_len-1] = 0;

                //add to vocabulary tree
                insert_word(&vocab, all_data[row].question[j]);
                j++;
            }
            if (j>=num_words){
                break;
            }
            word = strtok(NULL, " \t");
        }

        //add to category list
        add_to_cat_list(all_data[row].cat, cat_list, &category_count);
    } //end for

    //close file
    fclose(f);

    //assign unique number to each feature
    //first feature is feature 1, feature 0 is for errors etc.
    unsigned int mm = 1;
    number_features(vocab, &mm);

    //Some of the csv rows aren't read in properly with fgets
    printf("Bad iterations = %i/%i\n", bad_iter, i);
    printf("Feature count = %i\n", count_features(vocab));

    //Number of slots that actually hold a parsed example: every line except
    //the first one and the rejected rows. The allocation size stays in
    //total_examples so that the cleanup below releases every slot.
    int valid_examples = total_examples-bad_iter-1;

    ////turn data into numeric features////
    //only the filled slots are converted; the remaining ones were never
    //written by the parser
    for (i=0; i<valid_examples; i++){
        num_data[i].example_num = all_data[i].example_num;
        num_data[i].cat = get_cat_index(cat_list, all_data[i].cat);
        words_to_num(&num_data[i], &all_data[i], &vocab, num_words);
    }

    //NOTE(review): the return value is not needed here; the call is kept
    //because count_features2 presumably fills in total_features, which is
    //read in the neighbour loop below - confirm against its definition.
    for (ii=0; ii<valid_examples; ii++){
        (void) count_features2(&num_data[ii]);
    }

    //distance between the example being classified and a candidate neighbour
    double distance;

    //correct/total/answer
    int c = 0;
    int total = 0;
    int answer;

    omp_set_dynamic(0); //Explicitly disable dynamic teams
    omp_set_num_threads(num_threads); //Specify thread count

    #pragma omp parallel \
            private(kk, ii, distance, answer) \
            reduction(+:c,total) \
            shared(num_data)
    {
        //results and mod are declared inside the parallel region, so every
        //thread works on its own copy

        //store struct which keep track of the k nearest neighbors
        struct distance_results results;
        results.example_num = 0;
        results.distances = calloc(k, sizeof(double));
        results.cat = calloc(k, sizeof(int));
        results.example_nums = calloc(k, sizeof(int));

        //struct used to calculate the mode of the k nearest neighbors
        struct mode mod;
        mod.count = calloc(k, sizeof(int));
        mod.cat = calloc(k, sizeof(int));

        #pragma omp for
        for (kk=0; kk<valid_examples; kk++){

            //only test on test data
            if (num_data[kk].example_num%train != 0){
                continue;
            }

            if (num_data[kk].cat == 0){
                continue;
            }

            results.correct_answer = num_data[kk].cat;
            results.example_num = num_data[kk].example_num;
            for (ii=0; ii<k; ii++){
                results.distances[ii] = 0;
                results.cat[ii] = 0;
                mod.count[ii] = 0;
                mod.cat[ii] = 0;
            }

            //calc distance to neighbors
            //NOTE(review): the bound of valid_examples-1 means the last
            //parsed example is never considered as a neighbour; kept as in
            //the original - confirm whether this is intended.
            for (ii=0; ii<valid_examples-1; ii++){
                //don't calc distance to self
                if (kk == ii){
                    continue;
                }
                //Eliminate bad data (examples with few words tend to have low
                //distances regardless of whether they are more similar...)
                if (num_data[ii].total_features < 40){
                    continue;
                }
                distance = get_distance(&num_data[kk], &num_data[ii], num_words);
                if (num_data[ii].example_num > 0){
                    add_distance_to_results(&results, distance, k,
                                            num_data[ii].cat, num_data[ii].example_num);
                }
            }

            answer = calc_nearest_neighbor(&results, &mod, k);
            if (answer == results.correct_answer){
                c += 1;
            }
            total += 1;

            //c and total are this thread's reduction copies at this point
            if (verbose>0 && debug>0){
                printf("Thread = %i, Correct/Total = %i/%i  Answer/Correct = %i/%i\n",
                    omp_get_thread_num(), c, total, answer, results.correct_answer);
            }
        }

        //Thread results
        #pragma omp barrier
        if (omp_get_thread_num() == 0){
            printf("/// Thread Results ///\n");
        }
        #pragma omp barrier
        printf("Thread = %i, Correct/Total = %i/%i\n",
                omp_get_thread_num(), c, total);

        //free distance result (all three per-thread arrays)
        free(results.distances);
        free(results.cat);
        free(results.example_nums);

        //free mode struct
        free(mod.count);
        free(mod.cat);
    }

    printf("/// Final Results ///\n");
    printf("Correct/Total = %i/%i\n", c, total);

    ////free malloc calls////
    //free feature tree
    free_feature_tree(vocab);

    //free numeric data (every allocated slot, not only the parsed ones)
    for (ii=0; ii<total_examples; ii++){
        free(num_data[ii].array_of_features);
    }
    free(num_data);

    //free category list
    for (ii=0; ii<max_categories; ii++){
        free(cat_list[ii]);
    }
    free(cat_list);

    //free all_data list (every allocated slot, not only the parsed ones)
    for (ii=0; ii<total_examples; ii++){
        for (jj=0; jj<num_words; jj++){
            free(all_data[ii].question[jj]);
        }
        free(all_data[ii].question);
        free(all_data[ii].cat);
    }
    free(all_data);

    //free buffer used to read in the csv
    free(csv_line);

    return 0;
}