matrix * WEKApopulatePredictionsMatrix(struct hash * config)
/* Read <modelDir>/weka.training.results and return a matrix with a single
 * row labeled "prediction" and one column per data line found after the
 * line containing "inst#" (counting stops at the first empty line or EOF).
 * Labels are copied from the matrix built out of <trainingDir>/data.arff.
 *   config - hash that must hold the "trainingDir" and "modelDir" values.
 * Returns NULL when no data lines follow the header.  Calls errAbort when
 * the results file cannot be opened or when a data line without ":?" names
 * neither "subgroup 1:subgroup" nor "subgroup 2:subgroup". */
{
//quick hack to get proper labels from training data. TODO: add a labeling scheme to WEKAwrapper for output
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * modelDir = hashMustFindVal(config, "modelDir");
char filename[1024];
safef(filename, sizeof(filename), "%s/data.arff", trainingDir);
matrix * labelTemplate = WEKAtoMetadataMatrix(filename);
safef(filename, sizeof(filename), "%s/weka.training.results", modelDir);
FILE * fp = fopen(filename, "r");
if(fp == NULL)
    errAbort("Couldn't open %s for reading.", filename);

// NOTE(review): the buffers returned by readLine() are never released in
// this function; if readLine() hands back heap memory (confirm), each line
// should be freed once it has been examined.

//first pass: skip ahead to the "inst#" header, then count the data lines
char * line;
while( (line = readLine(fp)) != NULL )
    {
    if(strstr(line, "inst#") != NULL)
        break;
    }
int samples = 0;
while( (line = readLine(fp)) != NULL && !sameString(line, "") )
    samples++;
if(samples == 0)
    {
    //the model returned no values (e.g. all known vals were null): release
    //everything acquired so far before bailing out
    fclose(fp);
    free_matrix(labelTemplate);
    return NULL;
    }
rewind(fp);

//second pass: advance the cursor to the header again
while( (line = readLine(fp)) != NULL )
    {
    if(strstr(line, "inst#") != NULL)
        break;
    }

//create target for results
matrix * result = init_matrix(1, samples);
safef(result->rowLabels[0], MAX_LABEL, "prediction");
copy_matrix_labels(result, labelTemplate, 2,2);
result->labels=1;
free_matrix(labelTemplate);

//read each result and save to results matrix
int i;
for(i = 0; i < result->cols && (line = readLine(fp)) != NULL; i++)
    {
    //lines containing ":?" are skipped; their column keeps whatever
    //init_matrix() put there
    if(strstr(line, ":?") == NULL)
        {
        //subgroup 1 is stored as a negative value, subgroup 2 as a positive
        //one; the magnitude is the last word on the line
        if(strstr(line, "subgroup 1:subgroup"))
            result->graph[0][i] = 0-atof(lastWordInLine(line));
        else if(strstr(line, "subgroup 2:subgroup"))
            result->graph[0][i] = atof(lastWordInLine(line));
        else
            errAbort("ERROR: Coudln't find a proper class assignment in your WEKA results file.\n");
        }
    }
fclose(fp);
return result;
}
void getTransFromFile(FILE* infile, char *path)
/* Scan infile one whitespace-separated word at a time and drive the XML
 * writers from the marker words found:
 *   "<PRE>#"    - next word is appended to the protein ID, last word on the
 *                 line is taken as the sequence length
 *   "predicted" - last word on the line is the number of TM helices; when
 *                 it is positive an XML file is created and populated
 *   "TMhelix"   - next two words are the helix start and end coordinates
 *   "</PRE>"    - closes the current XML file and resets per-protein state
 *   infile - already opened input stream (not closed here)
 *   path   - passed through to createXMLFile/populateXMLFile */
{
FILE* outfile = NULL;
int start, end, tmNumber, count, seqLength;
char *line, *cursor, *token;
struct dyString *proteinID = newDyString(24);
start = end = tmNumber = count = seqLength = 0;
while( ( line  = readLine(infile) ) != NULL )
    {
    /* nextWord() advances the pointer it is given, so parse through a
     * separate cursor and keep 'line' pointing at the start of the buffer
     * for the freeMem() at the bottom of the loop. */
    cursor = line;
    while( (token = nextWord(&cursor)) != NULL )
        {
        /* The tests below are deliberately sequential rather than else-if:
         * a branch may replace 'token', and the later tests then see the
         * replacement.  Each one guards against a NULL replacement. */
        if( sameString(token,"<PRE>#"))
            {  /*grab the protein ID after each <PRE> tag*/
            token = nextWord(&cursor);
            if( token != NULL )
                dyStringAppend(proteinID, token);
            token = lastWordInLine(cursor);
            seqLength = (token != NULL) ? atoi(token) : 0;
            }
        if( token != NULL && sameString(token,"predicted") )
            {   /*grab the number of transmembrane helices*/
            token = lastWordInLine(cursor);
            tmNumber = (token != NULL) ? atoi(token) : 0;
            if(tmNumber > 0)
                {  /*create a new xml file, closing any still-open one first*/
                carefulClose(&outfile);
                outfile = createXMLFile(proteinID->string, path);
                populateXMLFile(outfile, tmNumber, proteinID->string, path);
                }
            }
        if( token != NULL && sameString(token,"</PRE>") )
            {  /*close the xml file*/
            carefulClose(&outfile);
            dyStringClear(proteinID);
            count = 0;
            }
        if( token != NULL && sameString(token,"TMhelix") )
            {
            if( (token = nextWord(&cursor)) != NULL )
                {  /*get the start*/
                start = atoi(token);
                if( (token = nextWord(&cursor)) != NULL  )
                    end = atoi(token);
                }
            /* Nothing is written unless an XML file is open. */
            if( outfile != NULL && count == 0 )
                addCoordinatesToXMLFile(outfile, 1, start-1, "N-term", count);
            count++;
            if( outfile != NULL )
                {
                addCoordinatesToXMLFile(outfile, start, end, "TM", count);
                if( count == tmNumber )
                    {  /*last helix: emit the tail segment and finish the file*/
                    addCoordinatesToXMLFile(outfile,end+1,seqLength,"C-term",count);
                    finishXMLFile(outfile, proteinID->string, seqLength);
                    }
                }
            }
        } /*end inner while*/
    freeMem(line);
    }   /*end outer while*/
/* Input may end without a closing </PRE>; do not leave the file open. */
carefulClose(&outfile);
freeDyString(&proteinID);
}
/* Example no. 3 */
/* 0 */
void extractAccFromGb(char *inName, char* outName, struct hash *accTbl)
/* Parse records of genBank file and print ones that match accession names.
 * (yanked from gbOneAcc, changed to use stdio so we can access compressed).
 *   inName  - input file, opened with gzMustOpen
 *   outName - output file; only opened once the first matching record is seen
 *   accTbl  - hash whose keys are the accessions to keep
 * Each record runs from a line starting with "LOCUS" through the line
 * starting with "//".  Aborts via errAbort if no "ACCESSION" line appears
 * within maxHeadLines lines of "LOCUS". */
{
enum {maxHeadLines=20, headLineSize=256 };
/* Lines between LOCUS and ACCESSION.  Fixed-size automatic storage, so
 * there is nothing to release when the function returns. */
char headLines[maxHeadLines][headLineSize];
char line[headLineSize];
FILE *inFh;
FILE *outFh = NULL;
int lineNum = 0;
int i;
char* acc;

verbose(1, "copying from %s\n", inName);

inFh = gzMustOpen(inName, "r");

while (TRUE)
    {
    boolean gotAcc = FALSE;
    boolean gotMyAcc = FALSE;
    int headLineCount = 0;
    /* Seek to LOCUS */
    for (;;)
        {
        /* NOTE(review): last argument is FALSE only here, presumably
         * meaning "running out of input is acceptable" -- confirm against
         * readData. */
        if (!readData(inFh, inName, line, headLineSize, FALSE))
            break;
        lineNum++;
        if (startsWith("LOCUS", line))
            break;
        }
    if (feof(inFh))
        break;
    /* Buffer the header lines until ACCESSION shows up. */
    for (i=0; i<maxHeadLines; ++i)
        {
        ++headLineCount;
        strcpy(headLines[i], line);
        readData(inFh, inName, line, headLineSize, TRUE);
        lineNum++;
        if (startsWith("ACCESSION", line))
            {
            gotAcc = TRUE;
            break;
            }
        }
    if (!gotAcc)
        errAbort("LOCUS without ACCESSION in %d lines at line %d of %s",
                 maxHeadLines, lineNum, inName);
    /* The accession is the last word on the ACCESSION line. */
    acc = lastWordInLine(line);
    gotMyAcc = (hashLookup(accTbl, acc) != NULL);
    if (gotMyAcc)
        {
        if (outFh == NULL)
            outFh = gbMustOpenOutput(outName);
        /* Replay the buffered header, then the ACCESSION line itself. */
        for (i=0; i<headLineCount; ++i)
            {
            fputs(headLines[i], outFh);
            fputc('\n', outFh);
            }
        fputs(line, outFh);
        fputc('\n', outFh);
        }
    /* Consume the rest of the record, copying it when it is wanted. */
    for (;;)
        {
        readData(inFh, inName, line, headLineSize, TRUE);
        lineNum++;
        if (gotMyAcc)
            {
            fputs(line, outFh);
            fputc('\n', outFh);
            }
        if (startsWith("//", line))
            break;
        }
    if ((outFh != NULL) && ferror(outFh))
        break;  /* write error */
    }
if (outFh != NULL)
    gbOutputRename(outName, &outFh);
gzClose(&inFh);
}
/* Example no. 4 */
/* 0 */
static boolean minFreqFail(struct vcfRecord *record, double minFreq)
/* Return TRUE if record's INFO include AF (alternate allele frequencies) or AC+AN
 * (alternate allele counts and total count of observed alleles) and the minor allele
 * frequency < minFreq -- or rather, major allele frequency > (1 - minFreq) because
 * variants with > 2 alleles might have some significant minor frequencies along with
 * tiny minor frequencies).
 * Sources are tried in order: AF (float), then AC+AN (integers, AN single-valued
 * and present), then a string-typed MAF whose description starts with
 * "Minor Allele Frequency".  Returns FALSE when none of them is usable. */
{
struct vcfFile *vcff = record->file;
boolean gotInfo = FALSE;
double refFreq = 1.0;     // whatever is left after subtracting every alt frequency
double maxAltFreq = 0.0;  // largest alternate allele frequency seen
int i;
const struct vcfInfoElement *afEl = vcfRecordFindInfo(record, "AF");
const struct vcfInfoDef *afDef = vcfInfoDefForKey(vcff, "AF");
if (afEl != NULL && afDef != NULL && afDef->type == vcfInfoFloat)
    {
    // If INFO includes alt allele freqs, use them directly.
    gotInfo = TRUE;
    for (i = 0;  i < afEl->count;  i++)
        {
        if (afEl->missingData[i])
            continue;
        double altFreq = afEl->values[i].datFloat;
        refFreq -= altFreq;
        if (altFreq > maxAltFreq)
            maxAltFreq = altFreq;
        }
    }
else
    {
    // Calculate alternate allele freqs from AC and AN:
    const struct vcfInfoElement *acEl = vcfRecordFindInfo(record, "AC");
    const struct vcfInfoDef *acDef = vcfInfoDefForKey(vcff, "AC");
    const struct vcfInfoElement *anEl = vcfRecordFindInfo(record, "AN");
    const struct vcfInfoDef *anDef = vcfInfoDefForKey(vcff, "AN");
    if (acEl != NULL && acDef != NULL && acDef->type == vcfInfoInteger &&
        anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1 &&
        anEl->missingData[0] == FALSE)
        {
        gotInfo = TRUE;
        // NOTE(review): totalCount == 0 is not guarded; the division below
        // would then produce inf/NaN -- confirm whether AN=0 can occur.
        int totalCount = anEl->values[0].datInt;
        for (i = 0;  i < acEl->count;  i++)
            {
            if (acEl->missingData[i])
                continue;
            int altCount = acEl->values[i].datInt;
            double altFreq = (double)altCount / totalCount;
            refFreq -= altFreq;
            // Track the maximum, exactly as the AF branch does.  (With '<'
            // maxAltFreq could never rise above its initial 0.0.)
            if (altFreq > maxAltFreq)
                maxAltFreq = altFreq;
            }
        }
    else
        {
        // Fall back to MAF: the frequency is parsed from the last word of
        // the final MAF value.
        const struct vcfInfoElement *mafEl = vcfRecordFindInfo(record, "MAF");
        const struct vcfInfoDef *mafDef = vcfInfoDefForKey(vcff, "MAF");
        if (mafEl != NULL && mafDef != NULL && mafDef->type == vcfInfoString
        && startsWith("Minor Allele Frequency",mafDef->description))
            {
            gotInfo = TRUE;

            if (mafEl->count >= 1 && !mafEl->missingData[mafEl->count-1])
                {
                // Work on a copy: lastWordInLine is handed a writable buffer
                // rather than the record's own string.
                char data[64];
                safecpy(data,sizeof(data),mafEl->values[mafEl->count-1].datString);
                char *lastWord = lastWordInLine(data);
                // An empty/blank value yields no word; leave the defaults alone.
                if (lastWord != NULL)
                    {
                    maxAltFreq = atof(lastWord);
                    refFreq -= maxAltFreq;
                    }
                }
            }
        }
    }
if (gotInfo)
    {
    double majorAlFreq = max(refFreq, maxAltFreq);
    if (majorAlFreq > (1.0 - minFreq))
        return TRUE;
    }
return FALSE;
}