static struct qaSeq *qaFaRead(char *qaName, char *faName, boolean mustReadQa)
/* Load every record of the fasta file faName into a list of qaSeq and,
 * when qaName is non-NULL, fill in quality data from that file.
 *   - A record whose name was already seen is skipped with a warning.
 *   - If qaName does not exist and mustReadQa is FALSE, a warning is issued
 *     and qaMakeFake() is applied to every sequence instead.
 *   - Otherwise the quality file is handed to fillInQac() or fillInQa(),
 *     depending on what isQacFile() reports.
 * Returns the list in the same order as the records appear in faName. */
{
struct qaSeq *list = NULL, *el;
struct qaSeq rec;	/* Only used as a holder for the reader's dna/size/name outputs. */
struct hash *seen = newHash(0);
FILE *fa = mustOpen(faName, "r");

/* Pull in all fasta records, keeping the first of any duplicated name. */
for (;;)
    {
    if (!faFastReadNext(fa, &rec.dna, &rec.size, &rec.name))
	break;
    if (hashLookup(seen, rec.name) == NULL)
	{
	AllocVar(el);
	hashAdd(seen, rec.name, el);
	el->name = cloneString(rec.name);
	/* Private copy; size+1 bytes are duplicated (presumably to carry a
	 * trailing terminator along with the bases - confirm with reader). */
	el->dna = cloneMem(rec.dna, rec.size+1);
	el->size = rec.size;
	slAddHead(&list, el);
	}
    else
	warn("Duplicate %s, ignoring all but first.", rec.name);
    }
fclose(fa);

/* Attach quality information, or fake it when the file is allowed
 * to be missing.  Note the list is still in head-added order here. */
if (qaName != NULL)
    {
    boolean useQaFile = (mustReadQa || fileExists(qaName));
    if (useQaFile)
	{
	if (isQacFile(qaName))
	    fillInQac(qaName, seen, list);
	else
	    fillInQa(qaName, seen, list);
	}
    else
	{
	warn("No quality file %s", qaName);
	for (el = list; el != NULL; el = el->next)
	    qaMakeFake(el);
	}
    }

freeHash(&seen);
slReverse(&list);
return list;
}
int main(int argc, char *argv[]) /* Process command line. */ { char *inName, *outName, **inNames; FILE *in, *out; int i, inCount; DNA *dna; int inSize, outSize; int dnaOff; char *seqName; struct dyString *subSeqName = newDyString(512); int maxSize = 100000; if (argc < 3) usage(); outName = argv[1]; inNames = &argv[2]; inCount = argc-2; out = mustOpen(outName, "w"); for (i=0; i<inCount; ++i) { inName = inNames[i]; printf("processing %s", inName); in = mustOpen(inName, "r"); while (faFastReadNext(in, &dna, &inSize, &seqName)) { for (dnaOff = 0; dnaOff < inSize; dnaOff += outSize) { printf("."); fflush(stdout); outSize = inSize - dnaOff; if (outSize > maxSize) outSize = maxSize; dyStringClear(subSeqName); dyStringPrintf(subSeqName, "%s.%d", seqName, dnaOff); faWriteNext(out, subSeqName->string, dna+dnaOff, outSize); } } fclose(in); printf("\n"); } }
struct hash *loadChroms(char *dir) /* Load zipped chromosome files into memory. */ { FILE *f; char fastaScan[16]; safef(fastaScan, sizeof(fastaScan), "*.%s", faExtn); struct fileInfo *chromEl, *chromList = listDirX(dir, fastaScan, TRUE); struct hash *chromHash = newHash(0); struct dnaSeq *seq; char chrom[128]; char *faName; int count = 0; verbose(2, "# scanning '%s/%s'\n", dir, fastaScan); for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next) { char *fileName = chromEl->name; splitPath(fileName, NULL, chrom, NULL); chopSuffix(chrom); if (startsWith("chr0", chrom)) /* Convert chr01 to chr1, etc. */ stripChar(chrom, '0'); if (sameString(chrom, "chrmt")) strcpy(chrom, "chr17"); f = fopen(fileName, "r"); AllocVar(seq); seq->name = cloneString(chrom); if (!faFastReadNext(f, &seq->dna, &seq->size, &faName)) errAbort("Couldn't load sequence from %s", fileName); seq->dna = cloneMem(seq->dna, seq->size+1); toUpperN(seq->dna, seq->size); hashAdd(chromHash, chrom, seq); verbose(3, "# loadChrom %s '%s'\n", fileName, chrom); fclose(f); f = NULL; count++; } if (0 == count) errAbort("not fasta files found in '%s/%s'\n", dir, fastaScan); return chromHash; }
void fakeOut(char *inName, char *outName)
/* fakeOut - fake a RepeatMasker .out file based on a N's in .fa file.
 *   inName  - fasta file to scan
 *   outName - file that receives the faked .out records
 * After a fixed three-line header, one record is written for every
 * maximal run of 'n' in each sequence, with 1-based inclusive begin/end
 * coordinates.  Runs at the very start and at the very end of a
 * sequence are included.  Only lower-case 'n' is tested (presumably the
 * fasta reader lower-cases its output - confirm). */
{
FILE *out = mustOpen(outName, "w");
FILE *in = mustOpen(inName, "r");
DNA *dna;
int dnaSize;
char *name;

fprintf(out,
    " SW perc perc perc query position in query matching repeat position in repeat\n"
    "score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID\n"
    "\n");
while (faFastReadNext(in, &dna, &dnaSize, &name))
    {
    int start = 0;		/* Zero-based start of the current run of N's. */
    int i;
    boolean n, lastN = TRUE;	/* Starting as TRUE lets a leading run begin at 0. */

    /* Scan one position past the end.  That extra position counts as
     * non-N, so a run of N's reaching the end of the sequence is closed
     * and reported just like an interior run. */
    for (i=0; i<=dnaSize; ++i)
	{
	n = (i < dnaSize && dna[i] == 'n');
	if (n != lastN)
	    {
	    if (n)
		start = i;
	    else if (i != 0)	/* i == 0 only means the sequence does not start with N. */
		fprintf(out, " 1000 15.0 2.0 2.0 %-9s %7d %7d (1234567) + faked fake 1 100 1\n", name, start+1, i);
	    lastN = n;
	    }
	}
    }
fclose(in);
fclose(out);
}
int main(int argc, char *argv[]) { char *genoListName; char *otherListName; char *oocFileName; char *typeName; char *outName; struct patSpace *patSpace; long startTime, endTime; char **genoList; int genoListSize; char *genoListBuf; char **otherList; int otherListSize; char *otherListBuf; char *genoName; int i; int blockCount = 0; struct dnaSeq **seqListList = NULL, *seq = NULL; char *outRoot; struct sqlConnection *conn; enum ffStringency stringency = ffCdna; int seedSize = 10; FILE *out; boolean noHead = FALSE; struct repeatTracker *rt; struct hash *repeatHash = newHash(10); hostName = getenv("HOST"); pushWarnHandler(warnHandler); startTime = clock1(); cgiSpoof(&argc, argv); minMatch = cgiOptionalInt("minMatch", minMatch); maxBad = cgiOptionalInt("maxBad", maxBad); minBases = cgiOptionalInt("minBases", minBases); dnaUtilOpen(); #ifdef DEBUG /* Hard wire command line input so don't have to type it in each * time run the stupid Gnu debugger. */ genoListName = "pFoo/geno.lst"; otherListName = "pFoo/bacend.lst"; typeName = "genomic"; oocFileName = "/d/biodata/human/10.ooc"; outName = "pFoo/pFoo.psl"; #else if (argc != 6 && argc != 7) usage(); genoListName = argv[1]; otherListName = argv[2]; typeName = argv[3]; oocFileName = argv[4]; if (sameWord(oocFileName, "none")) oocFileName = NULL; outName = argv[5]; if (argc == 7) { if (sameWord("noHead", argv[6])) noHead = TRUE; else usage(); } #endif if (sameWord(typeName, "mRNA") || sameWord(typeName, "cDNA")) { stringency = ffCdna; } else if (sameWord(typeName, "genomic")) { stringency = ffTight; } else if (sameWord(typeName, "g2g")) { stringency = ffTight; veryTight = TRUE; seedSize = 11; } else if (sameString(typeName, "asm")) { stringency = ffTight; avoidSelfSelf = TRUE; } else { warn("Unrecognized otherType %s\n", typeName); usage(); } readAllWordsOrFa(genoListName, &genoList, &genoListSize, &genoListBuf); filterMissingFiles(genoList, &genoListSize); if (genoListSize <= 0) errAbort("There are no files that exist in %s\n", 
genoListName); readAllWordsOrFa(otherListName, &otherList, &otherListSize, &otherListBuf); if (otherListSize <= 0) errAbort("There are no files that exist in %s\n", otherListName); filterMissingFiles(otherList, &otherListSize); out = mustOpen(outName, "w"); if (!noHead) pslWriteHead(out); AllocArray(seqListList, genoListSize); for (i=0; i<genoListSize; ++i) { genoName = genoList[i]; if (!startsWith("#", genoName) ) seqListList[i] = seq = faReadAllDna(genoName); for (;seq != NULL; seq = seq->next) { int size = seq->size; char *name = seq->name; struct hashEl *hel; AllocVar(rt); AllocArray(rt->repBytes, size); rt->seq = seq; if ((hel = hashLookup(repeatHash, name)) != NULL) errAbort("Duplicate %s in %s\n", name, genoName); hashAdd(repeatHash, name, rt); } storeMasked(repeatHash, genoName); } patSpace = makePatSpace(seqListList, genoListSize, seedSize, oocFileName, minMatch, 2000); endTime = clock1(); printf("Made index in %ld seconds\n", (endTime-startTime)); startTime = endTime; for (i=0; i<otherListSize; ++i) { FILE *f; char *otherName; int c; int dotCount = 0; struct dnaSeq otherSeq; ZeroVar(&otherSeq); otherName = otherList[i]; if (startsWith("#", otherName) ) continue; f = mustOpen(otherName, "r"); while ((c = fgetc(f)) != EOF) if (c == '>') break; printf("%s\n", otherName); fflush(stdout); while (faFastReadNext(f, &otherSeq.dna, &otherSeq.size, &otherSeq.name)) { aliSeqName = otherSeq.name; oneStrand(patSpace, repeatHash, &otherSeq, FALSE, stringency, out); reverseComplement(otherSeq.dna, otherSeq.size); oneStrand(patSpace, repeatHash, &otherSeq, TRUE, stringency, out); aliSeqName = NULL; } fclose(f); } freePatSpace(&patSpace); endTime = clock1(); printf("Alignment time is %ld sec\n", (endTime-startTime)); startTime = endTime; fclose(out); return 0; }