static struct qaSeq *qaFaRead(char *qaName, char *faName, boolean mustReadQa)
/* Read every record of the FA file faName into a list of qaSeq, then fill
 * in quality information from qaName (QA or QAC format).
 *   qaName     - quality file; when NULL the quality step is skipped.
 *   faName     - FA file; opened with mustOpen.
 *   mustReadQa - when FALSE and qaName does not exist, qaMakeFake is called
 *                on every record instead of reading a quality file.
 * The list is built by head insertion and reversed before it is returned.
 * Only the first record carrying a given name is kept. */
{
struct hash *seenNames = newHash(0);
struct qaSeq *list = NULL, *el;
struct qaSeq rec;
FILE *faFile = mustOpen(faName, "r");

/* Pass 1: pull the sequences out of the FA file. */
for (;;)
    {
    if (!faFastReadNext(faFile, &rec.dna, &rec.size, &rec.name))
        break;
    if (hashLookup(seenNames, rec.name) == NULL)
        {
        AllocVar(el);
        hashAdd(seenNames, rec.name, el);
        /* Take private copies of the name and of size+1 bytes of dna
         * (presumably the bases plus a terminator -- confirm against
         * faFastReadNext). */
        el->name = cloneString(rec.name);
        el->dna = cloneMem(rec.dna, rec.size+1);
        el->size = rec.size;
        slAddHead(&list, el);
        }
    else
        warn("Duplicate %s, ignoring all but first.", rec.name);
    }
fclose(faFile);

/* Pass 2: attach quality data, or fake it when the file is allowed to be
 * absent and is. */
if (qaName != NULL)
    {
    boolean fakeIt = (!mustReadQa && !fileExists(qaName));
    if (fakeIt)
        {
        warn("No quality file %s", qaName);
        el = list;
        while (el != NULL)
            {
            qaMakeFake(el);
            el = el->next;
            }
        }
    else if (isQacFile(qaName))
        fillInQac(qaName, seenNames, list);
    else
        fillInQa(qaName, seenNames, list);
    }

freeHash(&seenNames);
slReverse(&list);
return list;
}
/* Example #2 */
int main(int argc, char *argv[])
/* Process command line.
 * argv[1] is the output file; argv[2..] are input FA files.  Every input
 * sequence is written to the output in pieces of at most maxSize bases,
 * each piece named "<sequence name>.<offset of piece in sequence>".
 * Returns 0 on success, 1 if the output file could not be closed cleanly. */
{
    char *inName, *outName, **inNames;
    FILE *in, *out;
    int i, inCount;
    DNA *dna;
    int inSize, outSize;
    int dnaOff;
    char *seqName;
    struct dyString *subSeqName = newDyString(512);
    int maxSize = 100000;

    if (argc < 3)
        usage();
    outName = argv[1];
    inNames = &argv[2];
    inCount = argc-2;
    out = mustOpen(outName, "w");
    for (i=0; i<inCount; ++i)
    {
        inName = inNames[i];
        printf("processing %s", inName);
        in = mustOpen(inName, "r");
        while (faFastReadNext(in, &dna, &inSize, &seqName))
        {
            /* outSize is assigned inside the body before the increment
             * expression reads it. */
            for (dnaOff = 0; dnaOff < inSize; dnaOff += outSize)
            {
                /* One progress dot per piece written. */
                printf(".");
                fflush(stdout);
                outSize = inSize - dnaOff;
                if (outSize > maxSize) outSize = maxSize;
                dyStringClear(subSeqName);
                dyStringPrintf(subSeqName, "%s.%d", seqName, dnaOff);
                faWriteNext(out, subSeqName->string, dna+dnaOff, outSize);
            }
        }
        fclose(in);
        printf("\n");
    }
    /* Close the output explicitly so that a failed final flush is reported
     * instead of being lost at process exit. */
    if (fclose(out) != 0)
    {
        perror(outName);
        return 1;
    }
    return 0;
}
/* Example #3 */
struct hash *loadChroms(char *dir)
/* Load chromosome fasta files found in dir into memory.
 * Files are selected with the pattern "*.<faExtn>".  For each file the
 * first sequence record is read, copied, upper-cased and stored in the
 * returned hash as a struct dnaSeq keyed by chromosome name.  The name is
 * derived from the file name: suffix removed, names starting with "chr0"
 * have '0' stripped, and "chrmt" is stored as "chr17".
 * Aborts if a file cannot be opened or read, or if no file matches.
 * NOTE(review): the original comment said the files are zipped, but they
 * are opened with plain fopen here -- confirm. */
{
FILE *f;
char fastaScan[16];
safef(fastaScan, sizeof(fastaScan), "*.%s", faExtn);
struct fileInfo *chromEl, *chromList = listDirX(dir, fastaScan, TRUE);
struct hash *chromHash = newHash(0);
struct dnaSeq *seq;
char chrom[128];
char *faName;
int count = 0;

verbose(2, "#    scanning '%s/%s'\n", dir, fastaScan);
for (chromEl = chromList; chromEl != NULL; chromEl = chromEl->next)
    {
    char *fileName = chromEl->name;
    splitPath(fileName, NULL, chrom, NULL);
    chopSuffix(chrom);
    if (startsWith("chr0", chrom)) /* Convert chr01 to chr1, etc. */
        stripChar(chrom, '0');
    if (sameString(chrom, "chrmt"))
        strcpy(chrom, "chr17");
    f = fopen(fileName, "r");
    /* fopen returns NULL on failure; stop here instead of handing a NULL
     * stream to faFastReadNext. */
    if (f == NULL)
        errAbort("Couldn't open %s", fileName);
    AllocVar(seq);
    seq->name = cloneString(chrom);
    if (!faFastReadNext(f, &seq->dna, &seq->size, &faName))
        errAbort("Couldn't load sequence from %s", fileName);
    /* Replace the pointer handed back by the reader with a private copy of
     * size+1 bytes before the next file is read. */
    seq->dna = cloneMem(seq->dna, seq->size+1);
    toUpperN(seq->dna, seq->size);
    hashAdd(chromHash, chrom, seq);
    verbose(3, "#    loadChrom %s '%s'\n", fileName, chrom);
    fclose(f);
    f = NULL;
    count++;
    }
if (0 == count)
    errAbort("not fasta files found in '%s/%s'\n", dir, fastaScan);
return chromHash;
}
void fakeOut(char *inName,  char *outName)
/* fakeOut - fake a RepeatMasker .out file based on a N's in .fa file.
 *   inName  - FA file to scan.
 *   outName - file to create; receives the fixed three-line header
 *             followed by one line per run of 'n' characters, giving the
 *             sequence name and the 1-based start and end of the run.
 * Runs that extend to the end of a sequence are reported as well.
 * Both files are closed before returning. */
{
FILE *out = mustOpen(outName, "w");
FILE *in = mustOpen(inName, "r");
DNA *dna;
int dnaSize;
char *name;

fprintf(out,
 "   SW  perc perc perc  query     position in query            matching       repeat         position in  repeat\n"
 "score  div. del. ins.  sequence    begin     end    (left)    repeat         class/family    begin   end (left)   ID\n"
 "\n");

while (faFastReadNext(in, &dna, &dnaSize, &name))
    {
    int start = 0, end = 0;
    int i;
    /* lastN starts TRUE with start at 0, so a sequence that opens with
     * 'n' is already inside a run beginning at position 0. */
    boolean n, lastN = TRUE;

    for (i=0; i<=dnaSize; ++i)
        {
        /* The position one past the last base counts as non-'n'.  That
         * closes a run reaching the end of the sequence, which would
         * otherwise never be written out. */
        n = (i < dnaSize && dna[i] == 'n');
        if (n != lastN)
            {
            if (n)
                start = i;
            else
                {
                end = i;
                /* i == 0 only means the sequence does not begin with 'n';
                 * there is no run to report. */
                if (i != 0)
                    fprintf(out, " 1000  15.0  2.0  2.0 %-9s  %7d %7d (1234567) +  faked          fake              1     100      1\n",
                        name, start+1, end);
                }
            lastN = n;
            }
        }
    }
fclose(in);
fclose(out);
}
int main(int argc, char *argv[])
/* Command line driver.
 * Arguments (non-DEBUG build):
 *   argv[1] genoList  - list of genomic files to index
 *   argv[2] otherList - list of files whose sequences are aligned
 *   argv[3] type      - one of mRNA, cDNA, genomic, g2g, asm
 *   argv[4] ooc       - ooc file name, or "none"
 *   argv[5] out       - output file, written with an optional psl header
 *   argv[6] optional "noHead" to suppress the header
 * Builds a patSpace over the genomic sequences, then runs oneStrand on
 * both strands of every sequence from the other list.  Returns 0. */
{
char *genoListName;
char *otherListName;
char *oocFileName;
char *typeName;
char *outName;
struct patSpace *patSpace;
long startTime, endTime;
char **genoList;
int genoListSize;
char *genoListBuf;
char **otherList;
int otherListSize;
char *otherListBuf;
char *genoName;
int i;
int blockCount = 0;
struct dnaSeq **seqListList = NULL, *seq = NULL;
enum ffStringency stringency = ffCdna;
int seedSize = 10;
FILE *out;
boolean noHead = FALSE;
struct repeatTracker *rt;
struct hash *repeatHash = newHash(10);

hostName = getenv("HOST");
pushWarnHandler(warnHandler);

startTime = clock1();
/* minMatch, maxBad and minBases keep their current values unless
 * overridden through the cgi option accessors. */
cgiSpoof(&argc, argv);
minMatch = cgiOptionalInt("minMatch", minMatch);
maxBad = cgiOptionalInt("maxBad", maxBad);
minBases = cgiOptionalInt("minBases", minBases);

dnaUtilOpen();

#ifdef DEBUG
/* Hard wire command line input so don't have to type it in each 
 * time run the stupid Gnu debugger. */

genoListName = "pFoo/geno.lst";
otherListName = "pFoo/bacend.lst";
typeName = "genomic";
oocFileName = "/d/biodata/human/10.ooc";
outName = "pFoo/pFoo.psl";

#else

if (argc != 6 && argc != 7)
    usage();

genoListName = argv[1];
otherListName = argv[2];
typeName = argv[3];
oocFileName = argv[4];
if (sameWord(oocFileName, "none"))
    oocFileName = NULL;
outName = argv[5];
if (argc == 7)
    {
    if (sameWord("noHead", argv[6]))
        noHead = TRUE;
    else
        usage();
    }

#endif 

/* Map the sequence type onto alignment settings. */
if (sameWord(typeName, "mRNA") || sameWord(typeName, "cDNA"))
    {
    stringency = ffCdna;
    }
else if (sameWord(typeName, "genomic"))
    {
    stringency = ffTight;
    }
else if (sameWord(typeName, "g2g"))
    {
    stringency = ffTight;
    veryTight = TRUE;
    seedSize = 11;
    }
else if (sameString(typeName, "asm"))
    {
    stringency = ffTight;
    avoidSelfSelf = TRUE;
    }
else
    {
    warn("Unrecognized otherType %s\n", typeName);
    usage();
    }

readAllWordsOrFa(genoListName, &genoList, &genoListSize, &genoListBuf);
filterMissingFiles(genoList, &genoListSize);
if (genoListSize <= 0)
    errAbort("There are no files that exist in %s\n", genoListName);
readAllWordsOrFa(otherListName, &otherList, &otherListSize, &otherListBuf);
/* Filter before checking the count, as for the geno list above, so the
 * "no files that exist" test sees the list after missing files have been
 * removed. */
filterMissingFiles(otherList, &otherListSize);
if (otherListSize <= 0)
    errAbort("There are no files that exist in %s\n", otherListName);
out = mustOpen(outName, "w");
if (!noHead)
    pslWriteHead(out);

/* Load the genomic sequences and register a repeatTracker for each one,
 * keyed by sequence name. */
AllocArray(seqListList, genoListSize);
for (i=0; i<genoListSize; ++i)
    {
    genoName = genoList[i];
    /* For a '#' entry nothing is loaded; seq is still NULL here because it
     * starts NULL and the loop below always leaves it NULL. */
    if (!startsWith("#", genoName)  )
        seqListList[i] = seq = faReadAllDna(genoName);
    for (;seq != NULL; seq = seq->next)
        {
        int size = seq->size;
        char *name = seq->name;
        struct hashEl *hel;
        AllocVar(rt);
        AllocArray(rt->repBytes, size);
        rt->seq = seq;
        if ((hel = hashLookup(repeatHash, name)) != NULL)
            errAbort("Duplicate %s in %s\n", name, genoName);
        hashAdd(repeatHash, name, rt);
        }
    storeMasked(repeatHash, genoName);
    }

patSpace = makePatSpace(seqListList, genoListSize, seedSize, oocFileName, minMatch, 2000);
endTime = clock1();
printf("Made index in %ld seconds\n",  (endTime-startTime));
startTime = endTime;

/* Align each sequence of each "other" file, forward strand first and then
 * its reverse complement. */
for (i=0; i<otherListSize; ++i)
    {
    FILE *f;
    char *otherName;
    int c;
    struct dnaSeq otherSeq;
    ZeroVar(&otherSeq);

    otherName = otherList[i];
    if (startsWith("#", otherName)  )
        continue;
    f = mustOpen(otherName, "r");
    /* Consume input up to and including the first '>' before handing the
     * stream to faFastReadNext. */
    while ((c = fgetc(f)) != EOF)
        if (c == '>')
            break;
    printf("%s\n", otherName);
    fflush(stdout);
    while (faFastReadNext(f, &otherSeq.dna, &otherSeq.size, &otherSeq.name))
        {
        /* aliSeqName is set only for the duration of the two oneStrand
         * calls on this sequence. */
        aliSeqName = otherSeq.name;
        oneStrand(patSpace, repeatHash, &otherSeq, FALSE, stringency, out);
        reverseComplement(otherSeq.dna, otherSeq.size);
        oneStrand(patSpace, repeatHash, &otherSeq, TRUE, stringency, out);
        aliSeqName = NULL;
        }
    fclose(f);
    }
freePatSpace(&patSpace);
endTime = clock1();
printf("Alignment time is %ld sec\n", (endTime-startTime));
startTime = endTime;
fclose(out);
return 0;
}