matrix * WEKApopulatePredictionsMatrix(struct hash * config) { //quick hack to get proper labes from trianing data. TODO: add a labeling scheme to WEKAwrapper for output char * trainingDir = hashMustFindVal(config, "trainingDir"); char * modelDir = hashMustFindVal(config, "modelDir"); char filename[1024]; safef(filename, sizeof(filename), "%s/data.arff", trainingDir); matrix * labelTemplate = WEKAtoMetadataMatrix(filename); safef(filename, sizeof(filename), "%s/weka.training.results", modelDir); FILE * fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open %s for reading.", filename); //read the number of data lines by advancing the cursor to where data starts, then counting lines char * line; while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } int samples = 0; while( (line = readLine(fp)) && line != NULL && !sameString(line, "")) samples++; if(samples == 0) return NULL;//catch where the model doesn't return values because of all null known vals rewind(fp); //advance the cursor again while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } //create target for results matrix * result = init_matrix(1, samples); safef(result->rowLabels[0], MAX_LABEL, "prediction"); copy_matrix_labels(result, labelTemplate, 2,2); result->labels=1; free_matrix(labelTemplate); //read each result and save to results matrix int i; for(i = 0; i < result->cols && (line = readLine(fp)) != NULL; i++) { if(strstr(line, ":?") == NULL) { if(strstr(line, "subgroup 1:subgroup")) result->graph[0][i] = 0-atof(lastWordInLine(line)); else if(strstr(line, "subgroup 2:subgroup")) result->graph[0][i] = atof(lastWordInLine(line)); else errAbort("ERROR: Coudln't find a proper class assignment in your WEKA results file.\n"); } } fclose(fp); return result; }
void getTransFromFile(FILE* infile, char *path) { FILE* outfile = NULL; int start, end, tmNumber, count, seqLength; char *line, *token; struct dyString *proteinID = newDyString(24); start = end = tmNumber = count = seqLength = 0; while( ( line = readLine(infile) ) != NULL ) { /*grab the protein ID after each <PRE> tag*/ while( (token = nextWord(&line)) != NULL ) { if( sameString(token,"<PRE>#")) { /*create a new xml file*/ token = nextWord(&line); dyStringAppend(proteinID, token); token = lastWordInLine(line); seqLength = atoi(token); } if( sameString(token,"predicted") ) { /*grab the number of transmembrane helices*/ token = lastWordInLine(line); tmNumber = atoi(token); if(tmNumber > 0) { outfile = createXMLFile(proteinID->string, path); populateXMLFile(outfile, tmNumber, proteinID->string, path); } } if( sameString(token,"</PRE>") ) { /*close the xml file*/ carefulClose(&outfile); dyStringClear(proteinID); count = 0; } if( sameString(token,"TMhelix") ) { if( (token = nextWord(&line)) != NULL ) { /*get the start*/ start = atoi(token); if( (token = nextWord(&line)) != NULL ) end = atoi(token); } if( count == 0) addCoordinatesToXMLFile(outfile, 1, start-1, "N-term", count); count++; addCoordinatesToXMLFile(outfile, start, end, "TM", count); if( count == tmNumber && outfile != NULL ) { addCoordinatesToXMLFile(outfile,end+1,seqLength,"C-term",count); finishXMLFile(outfile, proteinID->string, seqLength); } } } /*end inner while*/ } /*end outer while*/ freeMem(line); freeDyString(&proteinID); }
void extractAccFromGb(char *inName, char* outName, struct hash *accTbl) /* Parse records of genBank file and print ones that match accession names. * (yanked from gbOneAcc, changed to use stdio so we can access compressed). */ { enum {maxHeadLines=20, headLineSize=256 }; char *headLines[maxHeadLines]; /* Store stuff between locus and accession. */ char line[headLineSize]; FILE *inFh; FILE *outFh = NULL; int lineNum = 0; int i; char* acc; verbose(1, "copying from %s\n", inName); inFh = gzMustOpen(inName, "r"); for (i=0; i<maxHeadLines; ++i) headLines[i] = needMem(headLineSize); while (TRUE) { boolean gotAcc = FALSE; boolean gotMyAcc = FALSE; int headLineCount = 0; /* Seek to LOCUS */ for (;;) { if (!readData(inFh, inName, line, headLineSize, FALSE)) break; lineNum++; if (startsWith("LOCUS", line)) break; } if (feof(inFh)) break; for (i=0; i<maxHeadLines; ++i) { ++headLineCount; strcpy(headLines[i], line); readData(inFh, inName, line, headLineSize, TRUE); lineNum++; if (startsWith("ACCESSION", line)) { gotAcc = TRUE; break; } } if (!gotAcc) errAbort("LOCUS without ACCESSION in %d lines at line %d of %s", maxHeadLines, lineNum, inName); acc = lastWordInLine(line); gotMyAcc = (hashLookup(accTbl, acc) != NULL); if (gotMyAcc) { if (outFh == NULL) outFh = gbMustOpenOutput(outName); for (i=0; i<headLineCount; ++i) { fputs(headLines[i], outFh); fputc('\n', outFh); } fputs(line, outFh); fputc('\n', outFh); } for (;;) { readData(inFh, inName, line, headLineSize, TRUE); lineNum++; if (gotMyAcc) { fputs(line, outFh); fputc('\n', outFh); } if (startsWith("//", line)) break; } if ((outFh != NULL) && ferror(outFh)) break; /* write error */ } if (outFh != NULL) gbOutputRename(outName, &outFh); gzClose(&inFh); }
static boolean minFreqFail(struct vcfRecord *record, double minFreq) /* Return TRUE if record's INFO include AF (alternate allele frequencies) or AC+AN * (alternate allele counts and total count of observed alleles) and the minor allele * frequency < minFreq -- or rather, major allele frequency > (1 - minFreq) because * variants with > 2 alleles might have some significant minor frequencies along with * tiny minor frequencies). */ { struct vcfFile *vcff = record->file; boolean gotInfo = FALSE; double refFreq = 1.0; double maxAltFreq = 0.0; int i; const struct vcfInfoElement *afEl = vcfRecordFindInfo(record, "AF"); const struct vcfInfoDef *afDef = vcfInfoDefForKey(vcff, "AF"); if (afEl != NULL && afDef != NULL && afDef->type == vcfInfoFloat) { // If INFO includes alt allele freqs, use them directly. gotInfo = TRUE; for (i = 0; i < afEl->count; i++) { if (afEl->missingData[i]) continue; double altFreq = afEl->values[i].datFloat; refFreq -= altFreq; if (altFreq > maxAltFreq) maxAltFreq = altFreq; } } else { // Calculate alternate allele freqs from AC and AN: const struct vcfInfoElement *acEl = vcfRecordFindInfo(record, "AC"); const struct vcfInfoDef *acDef = vcfInfoDefForKey(vcff, "AC"); const struct vcfInfoElement *anEl = vcfRecordFindInfo(record, "AN"); const struct vcfInfoDef *anDef = vcfInfoDefForKey(vcff, "AN"); if (acEl != NULL && acDef != NULL && acDef->type == vcfInfoInteger && anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1 && anEl->missingData[0] == FALSE) { gotInfo = TRUE; int totalCount = anEl->values[0].datInt; for (i = 0; i < acEl->count; i++) { if (acEl->missingData[i]) continue; int altCount = acEl->values[i].datInt; double altFreq = (double)altCount / totalCount; refFreq -= altFreq; if (altFreq < maxAltFreq) maxAltFreq = altFreq; } } else // Use MAF for alternate allele freqs from MAF: { const struct vcfInfoElement *mafEl = vcfRecordFindInfo(record, "MAF"); const struct vcfInfoDef *mafDef = vcfInfoDefForKey(vcff, "MAF"); if (mafEl != NULL && mafDef != NULL && mafDef->type == vcfInfoString && startsWith("Minor Allele Frequency",mafDef->description)) { // If INFO includes alt allele freqs, use them directly. gotInfo = TRUE; if (mafEl->count >= 1 && !mafEl->missingData[mafEl->count-1]) { char data[64]; safecpy(data,sizeof(data),mafEl->values[mafEl->count-1].datString); maxAltFreq = atof(lastWordInLine(data)); refFreq -= maxAltFreq; } } } } if (gotInfo) { double majorAlFreq = max(refFreq, maxAltFreq); if (majorAlFreq > (1.0 - minFreq)) return TRUE; } return FALSE; }