void readFa(char *fileName, struct rnaCover **retList, struct hash **retHash) /* Read in an FA file and store name and size of every record in hash/list. */ { struct rnaCover *list = NULL, *rc; struct hash *hash = newHash(18); struct lineFile *lf = lineFileOpen(fileName, TRUE); DNA *dna; int size; char *name; while (faSpeedReadNext(lf, &dna, &size, &name)) { if (size >= minSize) { AllocVar(rc); slAddHead(&list, rc); if (hashLookup(hash, name)) { warn("Duplicate %s line %d of %s, skipping", name, lf->lineIx, lf->fileName); continue; } hashAddSaveName(hash, name, rc, &rc->name); rc->qSize = size; } } slReverse(&list); *retList = list; *retHash = hash; }
void splitNcbiFa(char *ncbiIn, char *outDir) /* splitNcbiFa - Split up NCBI format fa file into UCSC formatted ones.. */ { struct lineFile *lf = lineFileOpen(ncbiIn, TRUE); static struct dnaSeq seq; ZeroVar(&seq); makeDir(outDir); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { FILE *f; char fileName[512]; char *row[5]; int wordCount; char ourName[129]; char cloneName[128]; wordCount = chopByChar(seq.name, '|', row, ArraySize(row)); if (wordCount != 5) errAbort("Expecting 5 | separated fields line %d of %s", lf->lineIx, lf->fileName); strcpy(cloneName, row[3]); chopSuffix(cloneName); sprintf(fileName, "%s/%s.fa", outDir, cloneName); sprintf(ourName, "%s_1", row[3]); faWrite(fileName, ourName, seq.dna, seq.size); } }
void polyInfo(char *pslFile, char *genoFile, char *estFile, char *outputFile) /* polyInfo - Collect info on polyAdenylation signals etc. */ { struct hash *pslHash = NULL; struct hash *genoHash = loadGeno(genoFile); static struct dnaSeq est; struct lineFile *lf = NULL; FILE *f = NULL; pslHash = pslIntoHash(pslFile); lf = lineFileOpen(estFile, TRUE); f = mustOpen(outputFile, "w"); while (faSpeedReadNext(lf, &est.dna, &est.size, &est.name)) { struct pslList *pl; struct psl *psl; struct estOrientInfo ei; if ((pl = hashFindVal(pslHash, est.name)) != NULL) { for (psl = pl->list; psl != NULL; psl = psl->next) { struct dnaSeq *geno = hashMustFindVal(genoHash, psl->tName); if (psl->tSize != geno->size) errAbort("psl generated on a different version of the genome"); ZeroVar(&ei); fillInEstInfo(&ei, &est, geno, psl); estOrientInfoTabOut(&ei, f); } } } }
struct frag *readFragList(char *fileName) /* Read list of frags from file. */ { struct frag *list = NULL, *frag; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct dnaSeq seq; char *s; int fragIx; struct hash *chromHash = newHash(5); ZeroVar(&seq); printf("Reading %s\n", fileName); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { AllocVar(frag); frag->name = cloneString(seq.name); s = strrchr(seq.name, '_'); if (s == NULL || !isdigit(s[1])) errAbort("Expecting _ and number in %s", seq.name); fragIx = atoi(s+1); frag->chrom = "chr14"; frag->start = fragIx*1000; frag->end = frag->start + 1000; slAddHead(&list, frag); } lineFileClose(&lf); printf("Read %d fragments from %s\n", slCount(list), fileName); slReverse(&list); return list; }
void countSeq(char *fileName, int *retSeqCount, int *retBaseCount)
/* Count bases and sequences in fa file.
 *   fileName     - fa file to scan.
 *   retSeqCount  - receives the number of records read.
 *   retBaseCount - receives the sum of the record sizes.
 * Both totals are accumulated in plain int. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int records = 0;
int bases = 0;
int recSize;
DNA *dna;
char *name;

for (;;)
    {
    if (!faSpeedReadNext(lf, &dna, &recSize, &name))
        break;
    records += 1;
    bases += recSize;
    }
lineFileClose(&lf);

*retSeqCount = records;
*retBaseCount = bases;
}
void correctEst(char *oldFa, char *pslFile, char *nibDir, char *outFa)
/* correctEst - Correct ESTs by passing them through genome.
 *   oldFa   - fa file of ESTs to read.
 *   pslFile - alignments, loaded into a hash with hashPsls().
 *   nibDir  - handed through to correctOne() for each aligned EST.
 *   outFa   - output fa file, opened for writing.
 * An EST that has a psl in the hash is passed to correctOne(), which is
 * given the output file.  An EST without one is written out unchanged with
 * faWriteNext().  Neither the input lineFile nor the output file is
 * explicitly closed here. */
{
struct hash *pslHash = hashPsls(pslFile);
struct lineFile *estLf = lineFileOpen(oldFa, FALSE);
FILE *out = mustOpen(outFa, "w");
static struct dnaSeq est;
struct psl *aln;
struct hash *nibHash = newHash(8);

while (faSpeedReadNext(estLf, &est.dna, &est.size, &est.name))
    {
    aln = hashFindVal(pslHash, est.name);
    if (aln == NULL)
        {
        /* No alignment for this EST: copy it through as is. */
        faWriteNext(out, est.name, est.dna, est.size);
        continue;
        }
    correctOne(&est, aln, nibDir, nibHash, out);
    }
}
void gsBig(char *faName, char *gtfName, char *suboptName, char *transName, char *exeName, char *parName, char *tmpDirName) /* gsBig - Run Genscan on big input and produce GTF files. */ { struct dnaSeq seq; struct lineFile *lf = lineFileOpen(faName, TRUE); FILE *gtfFile = mustOpen(gtfName, "w"); FILE *subFile = NULL; FILE *transFile = NULL; ZeroVar(&seq); if (suboptName != NULL) subFile = mustOpen(suboptName, "w"); if (transName != NULL) transFile = mustOpen(transName, "w"); if (exeName != NULL) exePath = cloneString(exeName); if (parName != NULL) parPath = cloneString(parName); if (tmpDirName != NULL) tmpDir = cloneString(tmpDirName); if (optionExists("prerun")) { char *preFileName = optionVal("prerun", NULL); char seqName[128]; struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName); writeSeg(seqName, seg, gtfFile, subFile, transFile); } else { struct dyString *dy = newDyString(1024); char tempFa[512], tempGs[512]; char dir1[256], root1[128], ext1[64]; int myPid = (int)getpid(); splitPath(faName, dir1, root1, ext1); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { int offset, sizeOne; struct segment *segList = NULL, *seg; char *seqName = cloneString(seq.name); int chunkNum = 0; for (offset = 0; offset < seq.size; offset += stepSize) { boolean allN = TRUE; int i; safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa", tmpDir, myPid, seqName, chunkNum); safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan", tmpDir, myPid, seqName, chunkNum); sizeOne = seq.size - offset; if (sizeOne > winSize) sizeOne = winSize; /* Genscan hangs forever if a chunk is all-N's... if so, * then skip this chunk. 
*/ for (i=offset; i < (offset+sizeOne); i++) { if (seq.dna[i] != 'N' && seq.dna[i] != 'n') { allN = FALSE; break; } } if (allN) { printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n", seqName, offset, (offset+sizeOne-1)); } else { faWrite(tempFa, "split", seq.dna + offset, sizeOne); dyStringClear(dy); dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa); if (suboptName != NULL) dyStringPrintf(dy, " -subopt"); dyStringPrintf(dy, " > %s", tempGs); verbose(3, "%s\n", dy->string); mustSystem(dy->string); seg = parseSegment(tempGs, offset, offset+sizeOne, NULL); slAddHead(&segList, seg); } chunkNum++; } slReverse(&segList); seg = mergeSegs(segList); writeSeg(seqName, seg, gtfFile, subFile, transFile); freez(&seqName); } if (! optionExists("noRemove")) { remove(tempFa); remove(tempGs); } } }
void faCount(char *faFiles[], int faCount) /* faCount - count bases. */ { int f, i, j, k; struct dnaSeq seq; unsigned long long totalLength = 0; unsigned long long totalBaseCount[5]; unsigned long long totalDinucleotideCount[5][5]; unsigned long long totalCpgCount = 0; struct lineFile *lf; ZeroVar(&seq); for (i = 0; i < ArraySize(totalBaseCount); i++) totalBaseCount[i] = 0; for (i = 0; i < ArraySize(totalDinucleotideCount); i++) for (j = 0; j < ArraySize(totalDinucleotideCount[i]); j++) totalDinucleotideCount[i][j] = 0; printf("#seq\tlen\tA\tC\tG\tT\tN\tcpg"); if (dinuc) printf("\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT"); printf("\n"); dnaUtilOpen(); for (f = 0; f<faCount; ++f) { lf = lineFileOpen(faFiles[f], FALSE); while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name)) { int prevBase = -1; int prevRcBase = -1; unsigned long long length = 0; unsigned long long baseCount[5]; unsigned long long dinucleotideCount[5][5]; unsigned long long cpgCount = 0; for (i = 0; i < ArraySize(baseCount); i++) baseCount[i] = 0; for (i = 0; i < ArraySize(dinucleotideCount); i++) for (j = 0; j < ArraySize(dinucleotideCount[i]); j++) dinucleotideCount[i][j] = 0; for (j=0; j<seq.size; ++j) { int baseVal = ntVal5[(int)(seq.dna[j])]; int rcBaseVal; assert(baseVal != -1); assert(baseVal <= 4); length++; switch(baseVal) { case A_BASE_VAL: rcBaseVal = T_BASE_VAL; break; case C_BASE_VAL: rcBaseVal = G_BASE_VAL; break; case G_BASE_VAL: rcBaseVal = C_BASE_VAL; break; case T_BASE_VAL: rcBaseVal = A_BASE_VAL; break; default: rcBaseVal = N_BASE_VAL; break; } baseCount[baseVal]++; if ((prevBase == C_BASE_VAL) && (baseVal == G_BASE_VAL)) cpgCount++; if (prevBase != -1) dinucleotideCount[prevBase][baseVal]++; if (strands) { length++; baseCount[rcBaseVal]++; if ((prevRcBase == G_BASE_VAL) && (rcBaseVal == C_BASE_VAL)) cpgCount++; if (prevRcBase != -1) dinucleotideCount[rcBaseVal][prevRcBase]++; } prevBase = baseVal; prevRcBase = rcBaseVal; } if (!summary) { 
printf("%s\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", seq.name, length, baseCount[A_BASE_VAL], baseCount[C_BASE_VAL], baseCount[G_BASE_VAL], baseCount[T_BASE_VAL], baseCount[N_BASE_VAL], cpgCount); if (dinuc) printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", dinucleotideCount[A_BASE_VAL][A_BASE_VAL], dinucleotideCount[A_BASE_VAL][C_BASE_VAL], dinucleotideCount[A_BASE_VAL][G_BASE_VAL], dinucleotideCount[A_BASE_VAL][T_BASE_VAL], dinucleotideCount[C_BASE_VAL][A_BASE_VAL], dinucleotideCount[C_BASE_VAL][C_BASE_VAL], dinucleotideCount[C_BASE_VAL][G_BASE_VAL], dinucleotideCount[C_BASE_VAL][T_BASE_VAL], dinucleotideCount[G_BASE_VAL][A_BASE_VAL], dinucleotideCount[G_BASE_VAL][C_BASE_VAL], dinucleotideCount[G_BASE_VAL][G_BASE_VAL], dinucleotideCount[G_BASE_VAL][T_BASE_VAL], dinucleotideCount[T_BASE_VAL][A_BASE_VAL], dinucleotideCount[T_BASE_VAL][C_BASE_VAL], dinucleotideCount[T_BASE_VAL][G_BASE_VAL], dinucleotideCount[T_BASE_VAL][T_BASE_VAL]); printf("\n"); } totalLength += length; totalCpgCount += cpgCount; for (i = 0; i < ArraySize(baseCount); i++) totalBaseCount[i] += baseCount[i]; for (i = 0; i < ArraySize(dinucleotideCount); i++) for (k = 0; k < ArraySize(dinucleotideCount[i]); k++) totalDinucleotideCount[i][k] += dinucleotideCount[i][k]; } lineFileClose(&lf); } printf("total\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", totalLength, totalBaseCount[A_BASE_VAL], totalBaseCount[C_BASE_VAL], totalBaseCount[G_BASE_VAL], totalBaseCount[T_BASE_VAL], totalBaseCount[N_BASE_VAL], totalCpgCount); if (dinuc) printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu", totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL], 
totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]); printf("\n"); if (summary) { printf("prcnt\t%-5.1f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f", (float)totalLength/totalLength, ((float)totalBaseCount[A_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[C_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[G_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[T_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[N_BASE_VAL])/(float)totalLength, (float)totalCpgCount/(float)totalLength); if (dinuc) printf("\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f", (float)totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL]/(float)totalLength, 
(float)totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]/(float)totalLength); printf("\n"); } }