int EludeCaller::Run() { pair<int, double> best_model(-1, -1.0); if (!train_file_.empty() && !load_model_file_.empty() && VERB >= 4 && !linear_calibration_) { cerr << "Warning: a model can be either trained or loaded from a file. " << "The two options should not be used together, unless linear calibration " << "should be carried out. In such a case please use the -j option. " << "The model will be trained using the peptides in " << train_file_ << endl; } // train a retention model if (!train_file_.empty()) { ProcessTrainData(); // initialize the feature table train_features_table_ = DataManager::InitFeatureTable( RetentionFeatures::kMaxNumberFeatures, train_psms_); if (automatic_model_sel_) { best_model = AutomaticModelSelection(); } else if (only_hydrophobicity_index_) { map<string, double> custom_hydrophobicity_index = TrainRetentionIndex(); if (!index_file_.empty()) { SaveRetentionIndexToFile(index_file_, custom_hydrophobicity_index); } else { PrintHydrophobicityIndex(custom_hydrophobicity_index); } cerr << "Now I saved the index" << endl; return 0; } else if (load_model_file_.empty()) { TrainRetentionModel(); } } else if (automatic_model_sel_) { if (!test_file_.empty()) { ProcessTestData(); processed_test_ = true; } best_model = AutomaticModelSelection(); } // load a model from a file if (!load_model_file_.empty() && !automatic_model_sel_) { rt_model_ = new RetentionModel(the_normalizer_); rt_model_->LoadModelFromFile(load_model_file_); } // save the model if (!save_model_file_.empty()) { if (rt_model_ != NULL && !rt_model_->IsModelNull()) { rt_model_->SaveModelToFile(save_model_file_); } else if (VERB >= 2) { cerr << "Warning: No trained model available. Nothing to save to " << save_model_file_ << endl; } } // append a file to the library if (append_model_) { if (automatic_model_sel_) { if (VERB >= 3) { cerr << "Warning: The model should already be in the library if " << "the automatic model selection option is employed. No model " << "will be appended to the library"<< endl; } } else if (rt_model_ == NULL) { if (VERB >= 3) { cerr << "Warning: No model available, nothing to append to the library." << endl; } } else { AddModelLibrary(); } } // save the retention index to a file if (!index_file_.empty()) { SaveIndexToFile(best_model.first); } // test a model if (!test_file_.empty()) { // process the test data if (!processed_test_) { ProcessTestData(); } if (test_psms_.size() <= 0) { if (VERB >= 3) { cerr << "Warning: no test psms available, nothing to do. " << endl; return 0; } } // initialize the feature table test_features_table_ = DataManager::InitFeatureTable( RetentionFeatures::kMaxNumberFeatures, test_psms_); int ret = 1; if (automatic_model_sel_) { int index = best_model.first; if (index < 0) { if (VERB >= 2) { cerr << "Error: No model available to predict rt. Execution aborted." << endl; } return 0; } rt_models_[index]->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms", test_psms_); if (linear_calibration_ && train_psms_.size() > 1) { rt_models_[index]->PredictRT(train_aa_alphabet_, ignore_ptms_, "calibration psms", train_psms_); } } else { int ret = rt_model_->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms", test_psms_); if (ret != 0) { if (VERB >= 2) { cerr << "Error: the amino acids alphabet in the test data does not match " <<"the ones used to train the model. Please use the -p option to ignore the ptms " <<"in the test data data are were not present in the training set " << endl; } return 0; } if (linear_calibration_ && train_psms_.size() > 1) { ret = rt_model_->PredictRT(train_aa_alphabet_, ignore_ptms_, "training psms", train_psms_); if (ret != 0) { if (VERB >= 2) { cerr << "Error: the amino acids alphabet in training data does not match " <<"the one used to train the model. Please use the -p option to ignore the ptms " <<"that were not present in the set used to train the model " << endl; } return 0; } } } // linear calibration is performed only for automatic model selection or when // loading a model from a file if (linear_calibration_ && (automatic_model_sel_ || (!load_model_file_.empty() && train_psms_.size() >= 2))) { if (train_psms_.size() <= 1 && !automatic_model_sel_) { if (VERB >= 3) { cerr << "Warning: at least 2 training psms are needed to calibrate the model. " << "No calibration performed. " << endl; } } else { // get the a and b coefficients if (linear_calibration_ && train_psms_.size() < 2) { if (VERB >= 4) { cerr << "Warning: No (enough) calibration peptides. Linear calibration " << "cannot be performed " << endl; } } else { pair<vector<double> , vector<double> > rts = GetRTs(train_psms_); lts = new LTSRegression(); lts->setData(rts.first, rts.second); lts->runLTS(); AdjustLinearly(test_psms_); } } } // compute performance measures if (test_includes_rt_) { double rank_correl = ComputeRankCorrelation(test_psms_); double pearson_correl = ComputePearsonCorrelation(test_psms_); double win = ComputeWindow(test_psms_); if (VERB >= 3) { cerr << "Performance measures for the test data: " << endl; cerr << " Pearson's correlation r = " << pearson_correl << endl; cerr << " Spearman's rank correlation rho = " << rank_correl << endl; cerr << " Delta_t 95% = " << win << endl; } } // write the predictions to file if (!output_file_.empty()) { DataManager::WriteOutFile(output_file_, test_psms_, test_includes_rt_); } else { if (VERB >= 2 && !supress_print_) { PrintPredictions(test_psms_); } } } return 0; }
int main(int argc, char* argv[]) { //check argument number if (argc < 3 || argc > 4) { printf("too many or too little arguments, please try again"); exit(0); } //check directory validity if (!IsDir(argv[1])) { printf("invalid directory, please try again"); exit(0); } //Initialize variables and index int docId; int pos; char *doc; char **filenames = NULL; int num_files = 0; HashTable *WordsFound = calloc(1, sizeof(HashTable)); num_files = GetFilenamesInDir(argv[1], &filenames); //check whether the folder has files if (num_files < 0) { printf("failed to get any filenames"); exit(0); } //iterate through each file in the directory for (int i = 0; i < num_files; i++) { //check that the file is in the correct format (title is a number) int filechecker = 0; for (int c = 0; c < strlen(filenames[i]); c++) { if (!isdigit(filenames[i][c])) { filechecker = 1; } } if (filechecker == 1) { continue; } //Load the document char *word; char file[100]; strcpy(file, argv[1]); strcat(file, filenames[i]); doc = LoadDocument(file); docId = GetDocumentId(filenames[i]); free(filenames[i]); pos = 0; //Iterate through each word in the html file (doc) while ((pos = GetNextWord(doc, pos, &word)) > 0) { NormalizeWord(word); if (InHashTable(word, WordsFound) == 0) { AddToHashTable(word, WordsFound); UpdateHashTable(word, docId, WordsFound); } else { UpdateHashTable(word, docId, WordsFound); free(word); } } free(doc); } free(filenames); SaveIndexToFile(argv[2], WordsFound); //Save the index to the file specified FreeHashTable(WordsFound); //only proceed if there was a third argument specified. If so, reload the index form the file you just created if (argc == 4) { HashTable *ReloadedIndex = ReadFile(argv[2]); SaveIndexToFile(argv[3], ReloadedIndex); FreeHashTable(ReloadedIndex); } return 0; }
int main (int argc, char **argv) { /* Check Arguments */ if (!CheckArguments(argc, argv)) { exit(-1); } /* Make variables for all things needed for indexer and indexer testing */ char *page_directory; char *index_filename; char *read_index_filename; char *new_index_filename; // If argument count is 3 initialize only 2 variables else initialize all page_directory = argv[1]; index_filename = argv[2]; // Initialize hashtable, word node, and document node HashTable *index_hashtable = calloc(1, sizeof(HashTable)); /*Make array to hold filenames (just document numbers) and use GetFilenamesInDir to grab all names */ char **filename_array; int number_of_files; if ((number_of_files = GetFilenamesInDir(page_directory, &filename_array)) < 0) { fprintf(stderr, "Could not get filenames in page directory. Exiting Now.\n"); exit(-1); } /* Add page_directory to the front of the filenames */ for (int i = 0; i < number_of_files; i++) { // Make pointe to current string in filename_array char *previous_string = filename_array[i]; // Get length of full string and initialize element of filename_array to that size int len = strlen(page_directory) + strlen(previous_string) + 1; char *new_string = calloc(len, sizeof(char)); // Make new string and free previous string strcpy(new_string, page_directory); strcat(new_string, previous_string); if (previous_string) free(previous_string); filename_array[i] = new_string; } /* Populate the index data structure from the words on each doc * Then Save to an index file */ for (int i = 0; i < number_of_files; i++) { /* Check that the filenames are digits */ int continue_flag = 0; char *digit_string = filename_array[i] + strlen(page_directory); // Check that every character in the filename is a digit for (int j = 0; j < strlen(digit_string); j++) { if (!isdigit(digit_string[j])) { fprintf(stderr, "This file %s contains something other than a digit \n", filename_array[i]); continue_flag = 1; } } if (continue_flag ==1) continue; // Check that each file in the filename array is a good file char *file_name = filename_array[i]; if (!IsFile(file_name)) { fprintf(stderr, "not file\n"); continue; } // Get contents of file into a string char *document = LoadDocument(file_name); if (document == NULL) { continue; } // Get DocumentID of file (check if bad) int document_id = GetDocumentId(file_name, page_directory); if (document_id < 0) { fprintf(stderr, "Error when converting document id char to integer\n"); continue; } // Use GetNext word, with pos variable and buffer, to get every word and add the word to the data structure int pos = 0; char *word_buffer; while ((pos = GetNextWord(document, pos, &word_buffer)) > 0) { // Update the index for each word // Normalize word then update index with that word NormalizeWord(word_buffer); UpdateIndex(word_buffer, document_id, index_hashtable); free(word_buffer); } // free the string containing the html and the word in filenamearray free(document); } /* Save to index file, and check that it actually went well */ if (!SaveIndexToFile(index_hashtable, index_filename)) { fprintf(stderr, "Could not save index hashtable to file\n"); exit(-1); } for (int i = 0; i < number_of_files; i++) { free(filename_array[i]); } free(filename_array); FreeHashTable(index_hashtable); if (argc == 3) { ; } /* Read index file into data strucutres and save to new index file */ else { // Assign 2 filenames read_index_filename = argv[3]; new_index_filename = argv[4]; // Read index file into data structures HashTable *read_index = ReadFile(read_index_filename); if (read_index == NULL) { fprintf(stderr, "Error when reading index file into data structures.\n"); exit(-1); } // Save index data structures into new file if (!SaveIndexToFile(read_index, new_index_filename)) { fprintf(stderr, "Could not save read index file into new index file\n"); exit(-1); } FreeHashTable(read_index); } return 0; }
int main(int argc, char* argv[]) { // Program parameter processing if(argc != 3){ printf("Error: Incorrect usage\n"); printf("Query Usage: ./query [indexed data(eg. indexer.dat)] [html data(eg. data)]\n"); return 1; } //Get the supplied directory name. int dirSize = strlen(argv[2]); char htmlDirectory[dirSize + 1]; htmlDirectory[0] = '\0'; strcat(htmlDirectory, argv[2]); //Get the fileName. int fileSize = strlen(argv[1]); char indexedFile[fileSize + 1]; indexedFile[0] = '\0'; strcat(indexedFile, argv[1]); if(IsFile(indexedFile) == 0){ printf("Incorrect path for indexed file\n"); return 1; } //Check if the path provided is a valid directory. if(IsDir(htmlDirectory) == 0){ printf("Incorrect path for html directory\n"); return 1; } DocumentNode *final = NULL; DocumentNode* orList[MAX_INPUT]; //OR's HashTable *tempHashTable = initHashTable(); // recreate the inverted index tempHashTable = ReadFile(indexedFile); int revert = SaveIndexToFile(tempHashTable,indexedFile); if (revert == 0) printf("0 Returned from inverting\n"); printf("Satrting query..\n"); //Queries char inp[MAX_INPUT]; char buff[MAX_INPUT]; int orFlag; int orIndex; //loop until user exits printf("Query:>"); LABEL:while ((fgets(inp,MAX_INPUT,stdin))) { printf("Query:>"); for (int index = 0; index < MAX_INPUT; index++){ orList[index] = NULL;//init list elements to null } orFlag = 999; orIndex = 0; // if its a blank enter if (strcmp(inp, "\n") == 0){ fprintf(stderr, "You entered a blank line. Please enter query words!\n"); continue; } // remove trailing newline char *pos; if ((pos=strchr(inp, '\n')) != NULL){ *pos = '\0'; } // check for the last word strcpy(buff, inp); char *isLast; char *lastWord; isLast = strtok(buff, " "); // find the last word while (isLast != NULL){ lastWord = isLast; isLast = strtok(NULL, " "); // check for AND OR and OR AND consecutively if (isLast != NULL) { if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0)) { if ((strcmp(isLast, "OR") == 0) || (strcmp(isLast, "AND") == 0)) { fprintf(stderr, "Two consecutive query words is invalid. Please try again.\n"); goto LABEL; } } } } if ((strcmp(lastWord, "AND") == 0) || (strcmp(lastWord, "OR") == 0)) { fprintf(stderr, "Last word in query is invalid: %s\n", lastWord); continue; } char *words; words = strtok(inp, " "); //break input on spaces //first word validity if ((strcmp(words, "AND") == 0) || (strcmp(words, "OR") == 0)) { fprintf(stderr, "First word in query is invalid: %s\n", words); continue; } NormalizeWord(words);//normalize the first valid word final = getDocumentList(words, tempHashTable, final);//init doc list words = strtok(NULL, " "); // return the list for a one word query(next = null) if (words == NULL) { final = querySort(final);//recursive sort printResult(final, htmlDirectory);//display freeDocumentList(final); final = NULL;