/* Build a new matrix holding only the requested rows and columns of m.
 *
 * desiredRows / desiredCols are index lists into m; a NULL list means
 * "take every row" (or column).  Lists supplied by the caller are only
 * read; lists generated here for the NULL case are freed before returning.
 * When m->labels is set, the row and column labels of the selected
 * entries are copied as well and the result is flagged as labeled.
 *
 * Returns the matrix created by init_matrix(); freeing it is presumably
 * the caller's job (verify against init_matrix/free_matrix). */
matrix * copy_matrix_subset(matrix *m, struct slInt * desiredRows, struct slInt * desiredCols)
{
    /* NULL selector => enumerate all indices of that dimension */
    struct slInt *rowList = (desiredRows != NULL) ? desiredRows : list_indices(m->rows);
    struct slInt *colList = (desiredCols != NULL) ? desiredCols : list_indices(m->cols);

    int rowCount = count_indices(rowList);
    int colCount = count_indices(colList);
    matrix *subset = init_matrix(rowCount, colCount);

    struct slInt *rowEl = NULL, *colEl = NULL;
    int r = 0, c = 0;

    /* carry the labels over when the source matrix has them */
    if(m->labels)
    {
        for(r = 0, rowEl = rowList; rowEl != NULL; rowEl = rowEl->next, r++)
            safef(subset->rowLabels[r], MAX_LABEL, "%s", m->rowLabels[rowEl->val]);

        for(c = 0, colEl = colList; colEl != NULL; colEl = colEl->next, c++)
            safef(subset->colLabels[c], MAX_LABEL, "%s", m->colLabels[colEl->val]);

        subset->labels = 1;
    }

    /* copy the selected cells; (r, c) index the result, the list values index m */
    for(r = 0, rowEl = rowList; rowEl != NULL; rowEl = rowEl->next, r++)
    {
        for(c = 0, colEl = colList; colEl != NULL; colEl = colEl->next, c++)
            subset->graph[r][c] = m->graph[rowEl->val][colEl->val];
    }

    /* only release the lists that were allocated in this function */
    if(desiredRows == NULL)
        slFreeList(&rowList);
    if(desiredCols == NULL)
        slFreeList(&colList);

    return subset;
}
/*
 * Entry point of the pattern-search tool.
 *
 * argv[1] is the input file and argv[2] the pattern to look for; the
 * option letters below select what is reported:
 *   -d  show patterns in coloured text
 *   -n  display the number of occurrences
 *   -i  list all the indices [from 0] of occurrence of the pattern
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE when fewer than two
 * positional arguments are given or the input file cannot be opened.
 */
int main(int argc, char **argv)
{
    st_manager holder;

    /* options */
    int c;          /* must be int: getopt() returns int, and -1 is not
                       representable where plain char is unsigned */
    int disp_f = 0; /* shows patterns in coloured text */
    int num_f = 0;  /* displays the number of occurrences */
    int ind_f = 0;  /* lists all the indices [from 0] of occurrence of pattern */

    if (argc < 3) {
        print_usage(argv[0]);
        return EXIT_FAILURE;
    }

    /* Grab the positional arguments BEFORE calling getopt(): some getopt
       implementations (e.g. GNU) reorder argv while parsing, so argv[1]
       and argv[2] may no longer be the file and pattern afterwards. */
    char *pattern = argv[2];
    FILE *inp_file = fopen(argv[1], "r");
    if (inp_file == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    /* parse arguments */
    while ((c = getopt(argc, argv, "dni")) != -1) {
        switch (c) {
        case 'd':
            disp_f = 1;
            break;
        case 'n':
            num_f = 1;
            break;
        case 'i':
            ind_f = 1;
            break;
        default:
            /* unknown options are ignored */
            break;
        }
    }

    holder = pat_search(inp_file, pattern);

    if (num_f) {
        printf("occurences: %d \n", holder.num_occur);
    }

    if (disp_f) {
        /* rewind file as it has already been read */
        fseek(inp_file, 0, SEEK_SET);
        display_pattern(inp_file, pattern, holder);
    }

    if (ind_f) {
        list_indices(holder);
    }

    fclose(inp_file);
    return EXIT_SUCCESS;
}
int main(int argc, char *argv[]) { if(argc < 2) usage(); char * configFile = argv[1]; struct hash * config = raReadSingle(configFile); matrix * data = NULL; if(sameString("bioInt", hashMustFindVal(config, "inputType"))) { char * profile = hashMustFindVal(config, "profile"); //TODO: make this optional so if it's not set it defaults char * db = hashMustFindVal(config, "db"); struct sqlConnection *conn = hAllocConnProfile(profile, db); char *tableName = hashMustFindVal(config, "tableName"); data = bioInt_fill_matrix(conn, tableName); hFreeConn(&conn); } else if(sameString("flatfiles", hashMustFindVal(config, "inputType"))) { char * dataFilepath = hashMustFindVal(config, "dataFilepath"); FILE * dataFile; dataFile = fopen(dataFilepath, "r"); if(dataFile == NULL) errAbort("ERROR: Couldn't open the file \"%s\"\n", dataFilepath); data = f_fill_matrix(dataFile, 1); fclose(dataFile); } else errAbort("Unsupported input type"); int itemsInFold = -1; int folds = foldsCountFromConfig(config); char * cv = hashMustFindVal(config, "crossValidation"); if(sameString("k-fold", cv)) itemsInFold = floor(data->cols/folds); else if(sameString("loo", cv)) itemsInFold = 1; if(folds == -1 || itemsInFold == -1) errAbort("Couldn't assign folds or itemsInFold\n"); if(hashFindVal(config, "excludeList")) { matrix * trimmedData = filterColumnsByExcludeList(config, data); free_matrix(data); data = trimmedData; } struct slInt *list = list_indices(data->cols); int split, splits = splitsCountFromConfig(config); matrix * foldReports = NULL; for(split = 1; split <= splits; split++) { struct slInt *shuffledList = seeded_shuffle_indices(list, split); matrix * tmp = reportFolds(config, data, folds,itemsInFold, shuffledList); if(split == 1) { foldReports = copy_matrix(tmp); } else { matrix * tmp2 = append_matrices(foldReports, tmp, 2); free_matrix(foldReports); foldReports = copy_matrix(tmp2); } slFreeList(&shuffledList); } fprint_discreteMatrix(stdout, foldReports); free_matrix(foldReports); 
freeHash(&config); slFreeList(&list); return 0; }
/**** WEKA specific functions *********/ matrix * WEKApopulateAccuracyMatrix(struct hash * config, int split, int fold) { char * trainingDir = hashMustFindVal(config, "trainingDir"); char * validationDir = hashMustFindVal(config, "validationDir"); char * modelDir = hashMustFindVal(config, "modelDir"); char filename[256]; //cat togetehr the training and validation KH values and record which were used to train safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", trainingDir, split, fold); matrix * trMetadata = WEKAtoMetadataMatrix(filename); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", validationDir, split, fold); matrix * valMetadata = WEKAtoMetadataMatrix(filename); matrix * metadata = append_matrices(trMetadata, valMetadata, 1); struct slInt * trainingList = list_indices(trMetadata->cols); //create a labeled matrix for results to be stored in matrix * result = init_matrix(2, metadata->cols); safef(result->rowLabels[0], MAX_LABEL, "trainingAccuracies"); safef(result->rowLabels[1], MAX_LABEL, "testingAccuracies"); copy_matrix_labels(result, metadata, 2,2); result->labels=1; //read the results from file safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.training.results", modelDir, split, fold); FILE * fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open %s for reading.", filename); //advance the cursor to where data starts char * line; while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } //read each result and save to results matrix int i; for(i = 0; i < trMetadata->cols && (line = readLine(fp)) != NULL; i++) { if(strstr(line, ":?") == NULL) { if(strstr(line, " + ") == NULL) result->graph[0][i] = 1; else result->graph[0][i] = 0; } } safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.validation.results", modelDir, split, fold); fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open %s for reading.", filename); //advance the cursor to where data 
starts while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } //read each result and save to results matrix for(i = i; i < result->cols && (line = readLine(fp)) != NULL; i++) { if(strstr(line, ":?") == NULL) { if(strstr(line, " + ") == NULL) result->graph[1][i] = 1; else result->graph[1][i] = 0; } } free_matrix(trMetadata); free_matrix(valMetadata); free_matrix(metadata); slFreeList(&trainingList); return result; }
matrix * SVMgetSampleAccuracy(struct hash *config) /*Read all the folds and calculate training and testing accuracies from best models*/ { char * trainingDir = hashMustFindVal(config, "trainingDir"); char * validationDir = hashMustFindVal(config, "validationDir"); char * modelDir = hashMustFindVal(config, "modelDir"); int fold, folds = foldsCountFromDataDir(config); int split, splits = splitsCountFromDataDir(config); matrix * accuracies = NULL; char filename[1024]; for(split = 1; split <= splits; split++) { for(fold = 1; fold <= folds; fold++) { //cat togetehr the training and validation KH values and record which were used to train safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", trainingDir, split, fold); matrix * trMetadata = SVMtoMetadataMatrix(filename); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", validationDir, split, fold); matrix * valMetadata = SVMtoMetadataMatrix(filename); struct slInt * trainingList = list_indices(trMetadata->cols); matrix * metadata = append_matrices(trMetadata, valMetadata, 1); //cat together the guesses from SVM safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.training.results", modelDir, split, fold); matrix * trainingPred = SVMpopulatePredictionsMatrix(filename); safef(trainingPred->rowLabels[0],MAX_LABEL, "Prediction"); copy_matrix_labels(trainingPred, trMetadata, 2,2); trainingPred->labels=1; safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.validation.results", modelDir, split, fold); matrix * testingPred = SVMpopulatePredictionsMatrix(filename); safef(testingPred->rowLabels[0], MAX_LABEL, "Prediction"); copy_matrix_labels(testingPred, valMetadata, 2,2); testingPred->labels=1; matrix * predictions = append_matrices(trainingPred, testingPred, 1); //get accuracies matrix * accuraciesInFold = SVMpopulateAccuracyMatrix(predictions, metadata, trainingList); //add the accuracies to the running totals if(split == 1 && fold == 1) accuracies = 
copy_matrix(accuraciesInFold); else add_matrices_by_colLabel(accuracies, accuraciesInFold); //clean up free_matrix(trMetadata); free_matrix(valMetadata); free_matrix(metadata); free_matrix(trainingPred); free_matrix(testingPred); free_matrix(predictions); free_matrix(accuraciesInFold); slFreeList(&trainingList); } } //normalize accuracies over number of splits and folds int i; for(i = 0; i < accuracies->cols; i++) { if(accuracies->graph[0][i] != NULL_FLAG) accuracies->graph[0][i] = (accuracies->graph[0][i] / ((folds-1) * splits)); if(accuracies->graph[1][i] != NULL_FLAG) accuracies->graph[1][i] = (accuracies->graph[1][i] / (1 * splits)); } return accuracies; }
matrix * NMFgetSampleAccuracy(struct hash *config) /*Read all the folds and calculate training and testing accuracies from best models*/ { char * trainingDir = hashMustFindVal(config, "trainingDir"); char * validationDir = hashMustFindVal(config, "validationDir"); char * modelDir = hashMustFindVal(config, "modelDir"); int fold, folds = foldsCountFromDataDir(config); int split, splits = splitsCountFromDataDir(config); matrix * accuracies = NULL; char filename[256]; FILE * fp; for(split = 1; split <= splits; split++) { for(fold = 1; fold <= folds; fold++) { //cat togetehr the training and validation KH values and record which were used to train safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", trainingDir, split, fold); fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open file %s\n", filename); matrix * trMetadata = f_fill_matrix(fp, 1); fclose(fp); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", validationDir, split, fold); fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open file %s\n", filename); matrix * valMetadata = f_fill_matrix(fp, 1); fclose(fp); struct slInt * trainingList = list_indices(trMetadata->cols); matrix * metadata = append_matrices(trMetadata, valMetadata, 1); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.training.results", modelDir, split, fold); fp = fopen(filename , "r"); if(!fp) errAbort("Couldn't open training results file %s", filename); matrix * trainingPred = f_fill_matrix(fp, 1); fclose(fp); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.validation.results", modelDir, split, fold); fp = fopen(filename , "r"); if(!fp) errAbort("Couldn't open validation results file %s", filename); matrix * valPred = f_fill_matrix(fp, 1); fclose(fp); //calc the accuracy by sample matrix * predictions = append_matrices(trainingPred, valPred, 1); matrix * accuraciesInFold = NMFpopulateAccuracyMatrix(predictions, metadata, trainingList); //add 
the accuracies to the running totals if(split == 1 && fold == 1) accuracies = copy_matrix(accuraciesInFold); else add_matrices_by_colLabel(accuracies, accuraciesInFold); //clean up free_matrix(trainingPred); free_matrix(valPred); free_matrix(predictions); free_matrix(trMetadata); free_matrix(valMetadata); free_matrix(metadata); free_matrix(accuraciesInFold); } } //normalize accuracies over number of splits and folds int i; for(i = 0; i < accuracies->cols; i++) { if(accuracies->graph[0][i] != NULL_FLAG) accuracies->graph[0][i] = (accuracies->graph[0][i] / ((folds-1) * splits)); if(accuracies->graph[1][i] != NULL_FLAG) accuracies->graph[1][i] = (accuracies->graph[1][i] / (1 * splits)); } return accuracies; }