matrix * copy_matrix_subset(matrix *m, struct slInt * desiredRows, struct slInt * desiredCols)
/* Build a new matrix from the rows and columns of m selected by desiredRows
 * and desiredCols. Each list node's val is used as an index into m, and the
 * order of the list is the order of rows/columns in the result.
 * A NULL list means "take everything" along that dimension; the selection is
 * then obtained from list_indices(). When m->labels is set, the matching row
 * and column labels are copied too and result->labels is set to 1.
 * Lists passed in by the caller are left alone; lists built here are freed.
 * NOTE(review): list values are not range-checked against m->rows / m->cols. */
{
//if desiredRows or desiredCols is NULL that means grab them all.
struct slInt *rows = desiredRows, *cols = desiredCols;
if(rows == NULL)
	rows = list_indices(m->rows);
if(cols == NULL)
	cols = list_indices(m->cols);

int subsetRows = count_indices(rows);
int subsetCols = count_indices(cols);
matrix * result = init_matrix(subsetRows, subsetCols);

struct slInt *curr_row, *curr_col;
int i, j;

//if the source matrix has labels, copy the selected ones into the result
if(m->labels)
	{
	for(i = 0, curr_row = rows; curr_row != NULL; curr_row = curr_row->next, i++)
		safef(result->rowLabels[i], MAX_LABEL, "%s", m->rowLabels[curr_row->val]);
	for(j = 0, curr_col = cols; curr_col != NULL; curr_col = curr_col->next, j++)
		safef(result->colLabels[j], MAX_LABEL, "%s", m->colLabels[curr_col->val]);
	result->labels = 1;
	}

//run over the selected rows and cols, copying from the source matrix.
//Both destination indices are reset explicitly by their own loop.
for(i = 0, curr_row = rows; curr_row != NULL; curr_row = curr_row->next, i++)
	{
	for(j = 0, curr_col = cols; curr_col != NULL; curr_col = curr_col->next, j++)
		result->graph[i][j] = m->graph[curr_row->val][curr_col->val];
	}

//only free the lists that were created in this function
if(desiredRows == NULL)
	slFreeList(&rows);
if(desiredCols == NULL)
	slFreeList(&cols);
return result;
}
Exemplo n.º 2
0
/*
 * Entry point: search the file named by argv[1] for the pattern argv[2] and
 * report according to the option flags:
 *   -d  show the occurrences via display_pattern()
 *   -n  print the number of occurrences
 *   -i  list the occurrences via list_indices()
 * Returns EXIT_FAILURE when fewer than two arguments are given, when the
 * input file cannot be opened, or when it cannot be rewound for -d;
 * EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
    st_manager holder;

    /*options */
    int c;              /* getopt() returns int; a plain char may never compare equal to -1 */
    int disp_f = 0;     /* shows patterns in coloured text */
    int num_f = 0;      /* displays the number of occurences */
    int ind_f = 0;      /* lists all the indices[from 0] of occurence of pattern */

    if (argc < 3) {
        print_usage(argv[0]);
        return EXIT_FAILURE;
    }

    /*
     * The positional arguments are captured before getopt() runs.
     * NOTE(review): some getopt implementations reorder argv while parsing,
     * so argv[1]/argv[2] should not be re-read after the loop below.
     */
    char *path = argv[1];
    char *pattern = argv[2];

    FILE *inp_file = fopen(path, "r");
    if (inp_file == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }

    /* parse arguments */
    while ((c = getopt(argc, argv, "dni")) != -1) {
        switch (c)
        {
            case 'd':
                disp_f = 1;
                break;
            case 'n':
                num_f = 1;
                break;
            case 'i':
                ind_f = 1;
                break;
            default:
                /* unknown options are ignored */
                break;
        }
    }

    holder = pat_search(inp_file, pattern);

    if (num_f) { printf("occurences: %d \n", holder.num_occur); }

    if (disp_f) {
        //rewind file as it has already been read
        if (fseek(inp_file, 0, SEEK_SET) != 0) {
            perror(path);
            fclose(inp_file);
            return EXIT_FAILURE;
        }
        display_pattern(inp_file, pattern, holder);
    }

    if (ind_f) {
        list_indices(holder);
    }

    fclose(inp_file);

    return EXIT_SUCCESS;
}
int main(int argc, char *argv[])
/* Entry point: read the config file named by argv[1], load the data matrix
 * from the source selected by "inputType" ("bioInt" or "flatfiles"), drop
 * excluded columns when "excludeList" is configured, then for every split
 * shuffle the column indices, call reportFolds() and join the per-split
 * reports with append_matrices(). The joined report is printed to stdout.
 * NOTE(review): assumes usage() does not return; argv[1] is read right after. */
{
if(argc < 2)
	usage();
char * configFile = argv[1];
struct hash * config = raReadSingle(configFile);

//load the data matrix
char * inputType = hashMustFindVal(config, "inputType");
matrix * data = NULL;
if(sameString("bioInt", inputType))
	{
	char * profile = hashMustFindVal(config, "profile"); //TODO: make this optional so if it's not set it defaults
	char * db = hashMustFindVal(config, "db");
	struct sqlConnection *conn = hAllocConnProfile(profile, db);
	char *tableName = hashMustFindVal(config, "tableName");
	data = bioInt_fill_matrix(conn, tableName);
	hFreeConn(&conn);
	}
else if(sameString("flatfiles", inputType))
	{
	char * dataFilepath = hashMustFindVal(config, "dataFilepath");
	FILE * dataFile = fopen(dataFilepath, "r");
	if(dataFile == NULL)
		errAbort("ERROR: Couldn't open the file \"%s\"\n", dataFilepath);
	data = f_fill_matrix(dataFile, 1);
	fclose(dataFile);
	}
else
	errAbort("Unsupported input type");

//filter first, so that the fold size below is derived from the columns
//that are actually shuffled and split
if(hashFindVal(config, "excludeList"))
	{
	matrix * trimmedData = filterColumnsByExcludeList(config, data);
	free_matrix(data);
	data = trimmedData;
	}

int itemsInFold = -1;
int folds = foldsCountFromConfig(config);
char * cv = hashMustFindVal(config, "crossValidation");
if(sameString("k-fold", cv))
	{
	//integer division already truncates; a non-positive fold count leaves
	//itemsInFold at -1 and is reported below instead of dividing by zero
	if(folds > 0)
		itemsInFold = data->cols/folds;
	}
else if(sameString("loo", cv))
	itemsInFold = 1;
if(folds == -1 || itemsInFold == -1)
	errAbort("Couldn't assign folds or itemsInFold\n");

struct slInt *list = list_indices(data->cols);

int split, splits = splitsCountFromConfig(config);
matrix * foldReports = NULL;
for(split = 1; split <= splits; split++)
	{
	struct slInt *shuffledList = seeded_shuffle_indices(list, split);

	//NOTE(review): assumes reportFolds() hands back a matrix owned by the
	//caller, like the other matrix-returning helpers used here - confirm
	matrix * splitReport = reportFolds(config, data, folds, itemsInFold, shuffledList);
	if(split == 1)
		foldReports = splitReport;
	else
		{
		//the appended matrix replaces the running report; both inputs are released
		matrix * combined = append_matrices(foldReports, splitReport, 2);
		free_matrix(foldReports);
		free_matrix(splitReport);
		foldReports = combined;
		}
	slFreeList(&shuffledList);
	}
if(foldReports == NULL)
	errAbort("No fold reports were generated (splits = %d)\n", splits);
fprint_discreteMatrix(stdout, foldReports);

free_matrix(foldReports);
free_matrix(data);
freeHash(&config);
slFreeList(&list);
return 0;
}
/**** WEKA specific functions *********/
matrix * WEKApopulateAccuracyMatrix(struct hash * config, int split, int fold)
/* Read the WEKA result files of one split/fold and return a 2-row matrix:
 * row 0 is labelled "trainingAccuracies", row 1 "testingAccuracies", with one
 * column per column of the training metadata followed by the validation
 * metadata. For every result line that does not contain ":?", the cell is set
 * to 1 when the line has no " + " marker and to 0 when it has one. Cells that
 * are never written keep whatever init_matrix() put there.
 * Paths are built from the "trainingDir", "validationDir" and "modelDir"
 * config entries. Aborts via errAbort() when a results file cannot be opened.
 * The caller owns the returned matrix. */
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
char filename[256];

//cat together the training and validation metadata
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", trainingDir, split, fold);
matrix * trMetadata = WEKAtoMetadataMatrix(filename);
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", validationDir, split, fold);
matrix * valMetadata = WEKAtoMetadataMatrix(filename);
matrix * metadata = append_matrices(trMetadata, valMetadata, 1);

//create a labeled matrix for results to be stored in
matrix * result = init_matrix(2, metadata->cols);
safef(result->rowLabels[0], MAX_LABEL, "trainingAccuracies");
safef(result->rowLabels[1], MAX_LABEL, "testingAccuracies");
copy_matrix_labels(result, metadata, 2,2);
result->labels=1;

//NOTE(review): the strings returned by readLine() are never released here;
//confirm whether the caller is expected to free them
char * line;
int i;

//read the training results from file
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.training.results", modelDir, split, fold);
FILE * fp = fopen(filename, "r");
if(fp == NULL)
	errAbort("Couldn't open %s for reading.", filename);
//advance the cursor to where data starts
while((line = readLine(fp)) != NULL)
	{
	if(strstr(line, "inst#") != NULL)
		break;
	}
//read each result and save to results matrix
for(i = 0; i < trMetadata->cols && (line = readLine(fp)) != NULL; i++)
	{
	if(strstr(line, ":?") != NULL)
		continue;
	result->graph[0][i] = (strstr(line, " + ") == NULL) ? 1 : 0;
	}
fclose(fp);

//read the validation results from file
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.validation.results", modelDir, split, fold);
fp = fopen(filename, "r");
if(fp == NULL)
	errAbort("Couldn't open %s for reading.", filename);
//advance the cursor to where data starts
while((line = readLine(fp)) != NULL)
	{
	if(strstr(line, "inst#") != NULL)
		break;
	}
//validation columns come after all training columns, so start there even
//if the training file held fewer result lines than expected
for(i = trMetadata->cols; i < result->cols && (line = readLine(fp)) != NULL; i++)
	{
	if(strstr(line, ":?") != NULL)
		continue;
	result->graph[1][i] = (strstr(line, " + ") == NULL) ? 1 : 0;
	}
fclose(fp);

free_matrix(trMetadata);
free_matrix(valMetadata);
free_matrix(metadata);

return result;
}
matrix * SVMgetSampleAccuracy(struct hash *config)
/* Read every split/fold under the "trainingDir", "validationDir" and
 * "modelDir" config entries, build the per-fold accuracy matrix with
 * SVMpopulateAccuracyMatrix() and sum those matrices by column label.
 * The sums are then normalised: row 0 is divided by (folds-1)*splits and
 * row 1 by splits; cells equal to NULL_FLAG are left untouched.
 * Aborts via errAbort() when no split/fold was found.
 * The caller owns the returned matrix. */
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
int fold, folds = foldsCountFromDataDir(config);
int split, splits = splitsCountFromDataDir(config);

matrix * accuracies = NULL;
char filename[1024];
for(split = 1; split <= splits; split++)
	{
	for(fold = 1; fold <= folds; fold++)
		{
		//cat together the training and validation metadata and record which columns were used to train
		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", trainingDir, split, fold);
		matrix * trMetadata = SVMtoMetadataMatrix(filename);

		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.svm", validationDir, split, fold);
		matrix * valMetadata = SVMtoMetadataMatrix(filename);
		struct slInt * trainingList = list_indices(trMetadata->cols);
		matrix * metadata = append_matrices(trMetadata, valMetadata, 1);

		//cat together the guesses from SVM
		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.training.results", modelDir, split, fold);
		matrix * trainingPred = SVMpopulatePredictionsMatrix(filename);
		safef(trainingPred->rowLabels[0],MAX_LABEL, "Prediction");
		copy_matrix_labels(trainingPred, trMetadata, 2,2);
		trainingPred->labels=1;

		safef(filename, sizeof(filename), "%s/split%02d/fold%02d/svm.validation.results", modelDir, split, fold);
		matrix * testingPred = SVMpopulatePredictionsMatrix(filename);
		safef(testingPred->rowLabels[0], MAX_LABEL, "Prediction");
		copy_matrix_labels(testingPred, valMetadata, 2,2);
		testingPred->labels=1;

		matrix * predictions = append_matrices(trainingPred, testingPred, 1);
		//get accuracies
		matrix * accuraciesInFold = SVMpopulateAccuracyMatrix(predictions, metadata, trainingList);

		//add the accuracies to the running totals
		if(split == 1 && fold == 1)
			accuracies = copy_matrix(accuraciesInFold);
		else
			add_matrices_by_colLabel(accuracies, accuraciesInFold);

		//clean up
		free_matrix(trMetadata);
		free_matrix(valMetadata);
		free_matrix(metadata);
		free_matrix(trainingPred);
		free_matrix(testingPred);
		free_matrix(predictions);
		free_matrix(accuraciesInFold);
		slFreeList(&trainingList);
		}
	}

//nothing was accumulated when there are no splits or no folds
if(accuracies == NULL)
	errAbort("No splits/folds found to calculate accuracies from (splits = %d, folds = %d)\n", splits, folds);

//normalize accuracies over number of splits and folds
//NOTE(review): the divisors suggest each sample is counted folds-1 times as
//training and once as testing per split - confirm; folds == 1 makes the
//training divisor zero
int trainingRuns = (folds-1) * splits;
int testingRuns = 1 * splits;
int i;
for(i = 0; i < accuracies->cols; i++)
	{
	if(accuracies->graph[0][i] != NULL_FLAG)
		accuracies->graph[0][i] = (accuracies->graph[0][i] / trainingRuns);
	if(accuracies->graph[1][i] != NULL_FLAG)
		accuracies->graph[1][i] = (accuracies->graph[1][i] / testingRuns);
	}
return accuracies;
}
matrix * NMFgetSampleAccuracy(struct hash *config)
/* Read every split/fold under the "trainingDir", "validationDir" and
 * "modelDir" config entries, load the metadata and NMF prediction tables with
 * f_fill_matrix(), build the per-fold accuracy matrix with
 * NMFpopulateAccuracyMatrix() and sum those matrices by column label.
 * The sums are then normalised: row 0 is divided by (folds-1)*splits and
 * row 1 by splits; cells equal to NULL_FLAG are left untouched.
 * Aborts via errAbort() when a file cannot be opened or when no split/fold
 * was found. The caller owns the returned matrix. */
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
int fold, folds = foldsCountFromDataDir(config);
int split, splits  = splitsCountFromDataDir(config);

matrix * accuracies = NULL;
char filename[256];
FILE * fp;
for(split = 1; split <= splits; split++)
    {
    for(fold = 1; fold <= folds; fold++)
        {
        //cat together the training and validation metadata and record which columns were used to train
        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", trainingDir, split, fold);
        fp = fopen(filename, "r");
        if(fp == NULL)
            errAbort("Couldn't open file %s\n", filename);
        matrix * trMetadata = f_fill_matrix(fp, 1);
        fclose(fp);

        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/metadata.tab", validationDir, split, fold);
        fp = fopen(filename, "r");
        if(fp == NULL)
            errAbort("Couldn't open file %s\n", filename);
        matrix * valMetadata = f_fill_matrix(fp, 1);
        fclose(fp);

        struct slInt * trainingList = list_indices(trMetadata->cols);
        matrix * metadata = append_matrices(trMetadata, valMetadata, 1);

        //load the predictions made on the training and validation samples
        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.training.results", modelDir, split, fold);
        fp = fopen(filename , "r");
        if(!fp)
            errAbort("Couldn't open training results file %s", filename);
        matrix * trainingPred = f_fill_matrix(fp, 1);
        fclose(fp);

        safef(filename, sizeof(filename), "%s/split%02d/fold%02d/NMFpredictor.validation.results", modelDir, split, fold);
        fp = fopen(filename , "r");
        if(!fp)
            errAbort("Couldn't open validation results file %s", filename);
        matrix * valPred = f_fill_matrix(fp, 1);
        fclose(fp);

        //calc the accuracy by sample
        matrix * predictions = append_matrices(trainingPred, valPred, 1);
        matrix * accuraciesInFold = NMFpopulateAccuracyMatrix(predictions, metadata, trainingList);
        //add the accuracies to the running totals
        if(split == 1 && fold == 1)
            accuracies = copy_matrix(accuraciesInFold);
        else
            add_matrices_by_colLabel(accuracies, accuraciesInFold);
        //clean up, including the per-fold index list
        free_matrix(trainingPred);
        free_matrix(valPred);
        free_matrix(predictions);
        free_matrix(trMetadata);
        free_matrix(valMetadata);
        free_matrix(metadata);
        free_matrix(accuraciesInFold);
        slFreeList(&trainingList);
        }
    }

//nothing was accumulated when there are no splits or no folds
if(accuracies == NULL)
    errAbort("No splits/folds found to calculate accuracies from (splits = %d, folds = %d)\n", splits, folds);

//normalize accuracies over number of splits and folds
//NOTE(review): the divisors suggest each sample is counted folds-1 times as
//training and once as testing per split - confirm; folds == 1 makes the
//training divisor zero
int trainingRuns = (folds-1) * splits;
int testingRuns = 1 * splits;
int i;
for(i = 0; i < accuracies->cols; i++)
    {
    if(accuracies->graph[0][i] != NULL_FLAG)
        accuracies->graph[0][i] = (accuracies->graph[0][i] / trainingRuns);
    if(accuracies->graph[1][i] != NULL_FLAG)
        accuracies->graph[1][i] = (accuracies->graph[1][i] / testingRuns);
    }
return accuracies;
}