Example #1
0
// Runs the workflow selected by the options stored on this caller.
//
// Steps, in order:
//   1. If a training file is set: process it and build the training feature
//      table, then either select a model automatically, or compute only the
//      hydrophobicity index (and return), or train a retention model (the
//      latter only when no model file is to be loaded).
//      Otherwise, with automatic model selection, process the test data (if a
//      test file is set) and select a model.
//   2. Load a model from file (skipped with automatic model selection).
//   3. Save the model, append it to the library and save the retention index,
//      each only when the corresponding option is set.
//   4. If a test file is set: predict retention times for the test psms,
//      optionally calibrate them linearly, report performance measures and
//      write or print the predictions.
//
// Returns 0 on every path, including the error paths; problems are only
// reported on stderr, subject to the VERB verbosity level.
int EludeCaller::Run() {
  pair<int, double> best_model(-1, -1.0);
  if (!train_file_.empty() && !load_model_file_.empty() && VERB >= 4
      && !linear_calibration_) {
    cerr << "Warning: a model can be either trained or loaded from a file. "
         << "The two options should not be used together, unless linear calibration "
         << "should be carried out. In such a case please use the -j option. "
         << "The model will be trained using the peptides in " << train_file_ << endl;
  }
  // train a retention model
  if (!train_file_.empty()) {
    ProcessTrainData();
    // initialize the feature table
    train_features_table_ = DataManager::InitFeatureTable(
         RetentionFeatures::kMaxNumberFeatures, train_psms_);
    if (automatic_model_sel_) {
      best_model = AutomaticModelSelection();
    } else if (only_hydrophobicity_index_) {
      // index-only mode: write (or print) the index and stop here
      map<string, double> custom_hydrophobicity_index = TrainRetentionIndex();
      if (!index_file_.empty()) {
        SaveRetentionIndexToFile(index_file_, custom_hydrophobicity_index);
      } else {
        PrintHydrophobicityIndex(custom_hydrophobicity_index);
      }
      cerr << "Now I saved the index" << endl;
      return 0;
    } else if (load_model_file_.empty()) {
      TrainRetentionModel();
    }
  } else if (automatic_model_sel_) {
    if (!test_file_.empty()) {
      ProcessTestData();
      processed_test_ = true;
    }
    best_model = AutomaticModelSelection();
  }

  // load a model from a file
  if (!load_model_file_.empty() && !automatic_model_sel_) {
    rt_model_ = new RetentionModel(the_normalizer_);
    rt_model_->LoadModelFromFile(load_model_file_);
  }
  // save the model
  if (!save_model_file_.empty()) {
    if (rt_model_ != NULL && !rt_model_->IsModelNull()) {
      rt_model_->SaveModelToFile(save_model_file_);
    } else if (VERB >= 2) {
      cerr << "Warning: No trained model available. Nothing to save to "
           << save_model_file_ << endl;
    }
  }
  // append a file to the library
  if (append_model_) {
    if (automatic_model_sel_) {
      if (VERB >= 3) {
        cerr << "Warning: The model should already be in the library if "
             << "the automatic model selection option is employed. No model "
             << "will be appended to the library"<< endl;
      }
    } else if (rt_model_ == NULL) {
      if (VERB >= 3) {
        cerr << "Warning: No model available, nothing to append to the library."
             << endl;
      }
    } else {
      AddModelLibrary();
    }
  }
  // save the retention index to a file
  if (!index_file_.empty()) {
    SaveIndexToFile(best_model.first);
  }
  // test a model
  if (!test_file_.empty()) {
    // process the test data (unless already done for model selection)
    if (!processed_test_) {
      ProcessTestData();
    }
    // Nothing to predict without test psms. The early return must not depend
    // on the verbosity level; only the warning does.
    if (test_psms_.empty()) {
      if (VERB >= 3) {
        cerr << "Warning: no test psms available, nothing to do. " << endl;
      }
      return 0;
    }
    // initialize the feature table
    test_features_table_ = DataManager::InitFeatureTable(
            RetentionFeatures::kMaxNumberFeatures, test_psms_);
    if (automatic_model_sel_) {
      int index = best_model.first;
      if (index < 0) {
        if (VERB >= 2) {
          cerr << "Error: No model available to predict rt. Execution aborted." << endl;
        }
        return 0;
      }
      rt_models_[index]->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms",
          test_psms_);
      if (linear_calibration_ && train_psms_.size() > 1) {
        rt_models_[index]->PredictRT(train_aa_alphabet_, ignore_ptms_, "calibration psms",
            train_psms_);
      }
    } else {
      // rt_model_ stays NULL when no model was trained or loaded above;
      // report it the same way as the automatic-selection branch does.
      if (rt_model_ == NULL) {
        if (VERB >= 2) {
          cerr << "Error: No model available to predict rt. Execution aborted." << endl;
        }
        return 0;
      }
      int ret = rt_model_->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms",
          test_psms_);
      if (ret != 0) {
        if (VERB >= 2) {
          cerr << "Error: the amino acids alphabet in the test data does not match "
               <<"the ones used to train the model. Please use the -p option to ignore the ptms "
               <<"in the test data data are were not present in the training set " << endl;
        }
        return 0;
      }
      if (linear_calibration_ && train_psms_.size() > 1) {
        ret = rt_model_->PredictRT(train_aa_alphabet_, ignore_ptms_, "training psms",
            train_psms_);
        if (ret != 0) {
          if (VERB >= 2) {
          cerr << "Error: the amino acids alphabet in training data does not match "
               <<"the one used to train the model. Please use the -p option to ignore the ptms "
               <<"that were not present in the set used to train the model " << endl;
          }
          return 0;
        }
      }
    }
    // linear calibration is performed only for automatic model selection or when
    // loading a model from a file
    if (linear_calibration_ && (automatic_model_sel_ || !load_model_file_.empty())) {
      if (train_psms_.size() < 2) {
        // at least two calibration points are needed to fit a line
        if (!automatic_model_sel_) {
          if (VERB >= 3) {
            cerr << "Warning: at least 2 training psms are needed to calibrate the model. "
                 << "No calibration performed. " << endl;
          }
        } else if (VERB >= 4) {
          cerr << "Warning: No (enough) calibration peptides. Linear calibration "
               << "cannot be performed " << endl;
        }
      } else {
        // get the a and b coefficients
        pair<vector<double> , vector<double> > rts = GetRTs(train_psms_);
        lts = new LTSRegression();
        lts->setData(rts.first, rts.second);
        lts->runLTS();
        AdjustLinearly(test_psms_);
      }
    }
    // compute performance measures
    if (test_includes_rt_) {
      double rank_correl = ComputeRankCorrelation(test_psms_);
      double pearson_correl = ComputePearsonCorrelation(test_psms_);
      double win = ComputeWindow(test_psms_);
      if (VERB >= 3) {
        cerr << "Performance measures for the test data: " << endl;
        cerr << "  Pearson's correlation r = " << pearson_correl << endl;
        cerr << "  Spearman's rank correlation rho = " << rank_correl << endl;
        cerr << "  Delta_t 95% = " << win << endl;
      }
    }
    // write the predictions to file
    if (!output_file_.empty()) {
      DataManager::WriteOutFile(output_file_, test_psms_, test_includes_rt_);
    } else {
      if (VERB >= 2 && !supress_print_) {
        PrintPredictions(test_psms_);
      }
    }
  }
  return 0;
}
Example #2
0
int main(int argc, char* argv[]) {
	//check argument number
	if (argc < 3 || argc > 4) {
		printf("too many or too little arguments, please try again");
		exit(0);
	}
	
	//check directory validity
	if (!IsDir(argv[1])) {
		printf("invalid directory, please try again");
		exit(0);
	}
	
	//Initialize variables and index
	int docId;
	int pos;
	char *doc;
	char **filenames = NULL;
	int num_files = 0;
	HashTable *WordsFound = calloc(1, sizeof(HashTable));
	num_files = GetFilenamesInDir(argv[1], &filenames);

	//check whether the folder has files
	if (num_files < 0) {
		printf("failed to get any filenames");
		exit(0);
	}

	//iterate through each file in the directory
	for (int i = 0; i < num_files; i++) {
		
		//check that the file is in the correct format (title is a number)
		int filechecker = 0;
		for (int c = 0; c < strlen(filenames[i]); c++) {
			if (!isdigit(filenames[i][c])) {
				filechecker = 1;
			}
		}
		if (filechecker == 1) {
			continue;
		}

		//Load the document
		char *word;
		char file[100];
		strcpy(file, argv[1]);
		strcat(file, filenames[i]);
		doc = LoadDocument(file);
		docId = GetDocumentId(filenames[i]);
		free(filenames[i]);
		
		pos = 0;
		//Iterate through each word in the html file (doc)
		while ((pos = GetNextWord(doc, pos, &word)) > 0) {
			NormalizeWord(word);
			if (InHashTable(word, WordsFound) == 0) {
				AddToHashTable(word, WordsFound);
				UpdateHashTable(word, docId, WordsFound);
			}
			else {
				UpdateHashTable(word, docId, WordsFound);
				free(word);
			}
		}
		free(doc);
	}	
	free(filenames);
	SaveIndexToFile(argv[2], WordsFound);				//Save the index to the file specified
	FreeHashTable(WordsFound);

	//only proceed if there was a third argument specified. If so, reload the index form the file you just created
	if (argc == 4) {
		HashTable *ReloadedIndex = ReadFile(argv[2]);
		SaveIndexToFile(argv[3], ReloadedIndex);
		FreeHashTable(ReloadedIndex);
	}
	return 0;
}
Example #3
0
/*
 * Indexer entry point.
 *
 * argv[1] is the page directory and argv[2] the index file to write. When two
 * further arguments are present, argv[3] is an index file to read back and
 * argv[4] the file the re-read index is saved to.
 *
 * Steps: validate the arguments, list the files of the page directory, prefix
 * each name with the directory, index every word of every file whose name is
 * all digits, save the index, and optionally round-trip it through ReadFile.
 *
 * Returns 0 on success; exits with -1 when the arguments are rejected, an
 * allocation fails, the directory cannot be listed, or an index cannot be
 * read or saved.
 */
int main (int argc, char **argv) {

	/* Check Arguments */
	if (!CheckArguments(argc, argv)) {
		exit(-1);
	}

	char *page_directory = argv[1];
	char *index_filename = argv[2];
	size_t directory_length = strlen(page_directory);

	// Initialize the (zeroed) index hashtable
	HashTable *index_hashtable = calloc(1, sizeof(HashTable));
	if (index_hashtable == NULL) {
		fprintf(stderr, "Could not allocate the index hashtable. Exiting Now.\n");
		exit(-1);
	}

	/*Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */
	char **filename_array;
	int number_of_files;
	if ((number_of_files = GetFilenamesInDir(page_directory, &filename_array)) < 0) {
		fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n");
		exit(-1);
	}


	/* Add page_directory to the front of the filenames */
	for (int i = 0; i < number_of_files; i++) {
		char *previous_string = filename_array[i];
		// Room for directory + name + terminating NUL
		size_t len = directory_length + strlen(previous_string) + 1;
		char *new_string = calloc(len, sizeof(char));
		if (new_string == NULL) {
			fprintf(stderr, "Could not allocate memory for a file path. Exiting Now.\n");
			exit(-1);
		}
		// Build the full path, then release the bare name it replaces
		strcpy(new_string, page_directory);
		strcat(new_string, previous_string);
		free(previous_string);

		filename_array[i] = new_string;
	}

	/* Populate the index data structure from the words on each doc
	 * Then Save to an index file                     
	 */
	for (int i = 0; i < number_of_files; i++) {

		/* Check that the filenames are digits */
		char *file_name = filename_array[i];
		const char *digit_string = file_name + directory_length;
		int continue_flag = 0;
		for (const char *p = digit_string; *p != '\0'; p++) {
			// isdigit requires a value representable as unsigned char
			if (!isdigit((unsigned char)*p)) {
				fprintf(stderr, "This file %s contains something other than a digit \n", file_name);
				continue_flag = 1;
				break;	// one report per file is enough
			}
		}
		if (continue_flag == 1)
			continue;

		// Check that each file in the filename array is a good file
		if (!IsFile(file_name)) {
			fprintf(stderr, "not file\n");	
			continue;
		}

		// Get contents of file into a string
		char *document = LoadDocument(file_name);
		if (document == NULL) {
			continue;
		}

		// Get DocumentID of file (check if bad)
		int document_id = GetDocumentId(file_name, page_directory);
		if (document_id < 0) {
			fprintf(stderr, "Error when converting document id char to integer\n");
			free(document);	// do not leak the loaded contents on this path
			continue;
		}

		// Walk the document word by word; each word is normalized, added to
		// the index and then released
		int pos = 0;
		char *word_buffer;
		while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) {
			NormalizeWord(word_buffer);
			UpdateIndex(word_buffer, document_id, index_hashtable);
			free(word_buffer);
		}
		// free the string containing the html
		free(document);
	}

	/* Save to index file, and check that it actually went well */
	if (!SaveIndexToFile(index_hashtable, index_filename)) {
		fprintf(stderr, "Could not save index hashtable to file\n");
		exit(-1);
	}

	for (int i = 0; i < number_of_files; i++) {
		free(filename_array[i]);
	}
	free(filename_array);
	FreeHashTable(index_hashtable);

	/* Read index file into data strucutres and save to new index file.
	 * Only done when both extra arguments exist, so argv[4] is never read
	 * past the end of the argument list.
	 */
	if (argc >= 5) {
		char *read_index_filename = argv[3];
		char *new_index_filename = argv[4];
		// Read index file into data structures 
		HashTable *read_index = ReadFile(read_index_filename);
		if (read_index == NULL) {
			fprintf(stderr, "Error when reading index file into data structures.\n");
			exit(-1);
		}
		// Save index data structures into new file
		if (!SaveIndexToFile(read_index, new_index_filename)) {
			fprintf(stderr, "Could not save read index file into new index file\n");
			exit(-1);
		}
		
		FreeHashTable(read_index);
	}

	return 0;
}
Example #4
0
int main(int argc, char* argv[])
{
	
    // Program parameter processing
	if(argc != 3){
		printf("Error: Incorrect usage\n");
		printf("Query Usage: ./query [indexed data(eg. indexer.dat)] [html data(eg. data)]\n");
		return 1;
	}

	//Get the supplied directory name.
    	int dirSize = strlen(argv[2]);
    	char htmlDirectory[dirSize + 1];
    	htmlDirectory[0] = '\0';
    	strcat(htmlDirectory, argv[2]);

	//Get the fileName.
	int fileSize = strlen(argv[1]);
	char indexedFile[fileSize + 1];
	indexedFile[0] = '\0';
	strcat(indexedFile, argv[1]);
	
    if(IsFile(indexedFile) == 0){
        printf("Incorrect path for indexed file\n");
        return 1;
    }

   	//Check if the path provided is a valid directory.
	if(IsDir(htmlDirectory) == 0){
		printf("Incorrect path for html directory\n");
        return 1;
	}
        
DocumentNode *final = NULL;
DocumentNode* orList[MAX_INPUT]; //OR's
HashTable *tempHashTable = initHashTable(); 

// recreate the inverted index
tempHashTable = ReadFile(indexedFile); 
int revert = SaveIndexToFile(tempHashTable,indexedFile);

if (revert == 0)
    printf("0 Returned from inverting\n");

printf("Satrting query..\n");

//Queries
char inp[MAX_INPUT];
char buff[MAX_INPUT];
int orFlag;
int orIndex;

//loop until user exits
printf("Query:>");

LABEL:while ((fgets(inp,MAX_INPUT,stdin)))
{ 
  
  printf("Query:>");
 
  for (int index = 0; index < MAX_INPUT; index++){
    orList[index] = NULL;//init list elements to null
  }
  
  orFlag = 999;
  orIndex = 0;
  
  // if its a blank enter
  if (strcmp(inp, "\n") == 0){
    fprintf(stderr, "You entered a blank line. Please enter query words!\n");
    continue;
  }  
 
  // remove trailing newline
  char *pos;
  if ((pos=strchr(inp, '\n')) != NULL){
    *pos = '\0';
  }
  
  // check for the last word
  strcpy(buff, inp);
  char *isLast;
  char *lastWord;
  isLast = strtok(buff, " ");
  
  // find the last word
  while (isLast != NULL){
    lastWord = isLast;
    isLast = strtok(NULL, " ");
    
    // check for AND OR and OR AND consecutively
    if (isLast != NULL)
    {
      if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0))
      {
        if ((strcmp(isLast, "OR") == 0) || (strcmp(isLast, "AND") == 0))
        {
          fprintf(stderr, "Two consecutive query words is invalid. Please try again.\n");
          goto LABEL;
        }
      }
    }
  }
  
  
  if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0))
  {
    fprintf(stderr, "Last word in query is invalid: %s\n", lastWord);
    continue;
  }

  
  
  char *words;
  words = strtok(inp, " "); //break input on spaces

  //first word validity
  if ((strcmp(words, "AND") == 0) || (strcmp(words, "OR") == 0))
  {
    fprintf(stderr, "First word in query is invalid: %s\n", words);
    continue;
  }
  NormalizeWord(words);//normalize the first valid word
  final = getDocumentList(words, tempHashTable, final);//init doc list
  
  
  words = strtok(NULL, " ");
  // return the list for a one word query(next = null)
  if (words == NULL)
  { 
    final = querySort(final);//recursive sort
    printResult(final, htmlDirectory);//display
           
    freeDocumentList(final);
    final = NULL;