Ejemplo n.º 1
0
void aliTrack(char *bacAcc, char *wholeName, char *partsName, 
    struct memGfx *mg, int x, int y, FILE *mapFile, int trim, char *repeatMask)
/* Write out one alignment track.
 *
 * Reads the sequence list in wholeName (only the first sequence is used) and
 * hands it to makePatSpace.  Then reads every sequence ("contig") in
 * partsName, walks each one in blocks of at most maxBlockSize bases while
 * skipping trim bases at both ends, and runs fastFind on each block.
 *
 * For every block one line is printed to stdout: either an UNALIGNED notice
 * or an HTML link to chkGlue.exe describing the hit.  Aligned blocks are also
 * drawn as a numbered box on mg and recorded in mapFile via mapWriteBox.
 *
 *   bacAcc     - echoed into the link query string and passed to mapWriteBox.
 *   wholeName  - file handed to faReadAllDna for the target sequence.
 *   partsName  - file handed to faReadAllDna for the contigs.
 *   mg         - drawing target for mgDrawBox/mgTextCentered.
 *   x, y       - x is added to each box's scaled start; y is passed unchanged
 *                to the map and drawing calls.
 *   mapFile    - passed to mapWriteBox.
 *   trim       - number of bases skipped at each end of every contig.
 *   repeatMask - echoed into the link query string (not escaped here).
 *
 * Relies on oocFile, blockColors, trackWidth, trackHeight and font, which are
 * defined outside this function.  If wholeName yields no sequences a warning
 * is issued and nothing is drawn. */
{
struct dnaSeq *whole, *partList, *part;
bits16 contig;
int maxBlockSize = 5000;
int wholeSize;
struct patSpace *ps;
DNA *wholeDna;

whole = faReadAllDna(wholeName);
if (whole == NULL)
    {
    /* An empty list would be dereferenced just below; bail out instead. */
    warn("No sequences in %s", wholeName);
    return;
    }
if (slCount(whole) > 1)
    warn("%d sequences in %s, only using first", slCount(whole), wholeName);
wholeDna = whole->dna;
wholeSize = whole->size;
ps = makePatSpace(&whole, 1, oocFile, 5, 500);
partList = faReadAllDna(partsName);
printf("%d contigs in %s\n\n", slCount(partList), partsName);

for (part = partList, contig = 0; part != NULL; part = part->next, ++contig)
    {
    DNA *dna = part->dna;
    int dnaSize = part->size;
    int start, size;
    int subIx = 0;      /* Index of the current block within this contig. */
    char numText[12];   /* Label drawn inside the box: one-based contig number. */

    /* Colors cycle once there are more contigs than entries in blockColors. */
    Color color = blockColors[contig%ArraySize(blockColors)];
    sprintf(numText, "%d", contig+1);
    for (start = trim; start < dnaSize-trim; start += size)
        {
        struct ffAli *left, *right;
        boolean rc;
        int score;

        /* Take whatever is left before the trimmed tail, capped at one block. */
        size = dnaSize - start-trim;
        if (size > maxBlockSize)
            size = maxBlockSize;
        if (!fastFind(dna+start, size, ps, &left, &rc, &score) )
            {
            printf("Contig %d.%d:%d-%d of %d UNALIGNED\n",
                contig+1, subIx, start, start+size, dnaSize);
            }
        else
            {
            int x1, x2;
            int xo, w;
            double quality;
            int qStart, qSize, tStart,tSize;
            char qualityString[40];

            /* Find the last alignment block so the overall extent is known. */
            right = left;
            while (right->right != NULL)
                right = right->right;
            qStart = left->nStart - dna;
            qSize = right->nEnd - left->nStart;
            if (rc)
                {
                /* fastFind reported a reverse-strand hit: recompute the query
                 * start through reverseOffset (presumably mapping it back to
                 * forward-strand coordinates of this block - confirm). */
                int rcEnd = right->nEnd - (dna+start) - 1;
                qStart = reverseOffset(rcEnd, size) + start;
                }
            tStart = left->hStart - wholeDna;
            tSize = right->hEnd - left->hStart;
            quality = 100.0 * score / qSize;
            /* NOTE(review): the cutoff is 25.0 but the label printed below it
             * says "<50%"; one of the two looks stale - confirm which. */
            if (quality >= 25.0)
                sprintf(qualityString, "%4.1f%%", quality);
            else
                sprintf(qualityString, "<50%%");

            /* The link carries the zero-based contig index; the visible text
             * below uses the one-based number. */
            printf("<A HREF=\"../cgi-bin/chkGlue.exe?bacAcc=%s&contig=%d&qStart=%d&qSize=%d&tStart=%d&tSize=%d&repeatMask=%s\">",
                bacAcc, contig, qStart, qSize, tStart, tSize, repeatMask);

            printf("Contig %d.%d:%d-%d %c of %d aligned %d-%d of %d aliSize %d quality %s</A>\n",
                contig+1, subIx, qStart, qStart+qSize, 
                (rc ? '-' : '+'), dnaSize, 
                tStart, tStart + tSize,
                wholeSize,
                qSize, qualityString);

            /* Scale target coordinates to track pixels and draw the box. */
            x1 = roundingScale(trackWidth, left->hStart - wholeDna, wholeSize);
            x2 = roundingScale(trackWidth, right->hEnd - wholeDna, wholeSize);
            xo = x1+x;
            w = x2-x1;
            mapWriteBox(mapFile, mtBlock, xo, y, w, trackHeight,
                bacAcc, contig, qStart, qSize, tStart, tSize);
            mgDrawBox(mg, xo, y, w, trackHeight, color);
            mgTextCentered(mg, xo, y, w, trackHeight, MG_WHITE, font, numText);
            ffFreeAli(&left);
            }
        ++subIx;
        }
    }
freePatSpace(&ps);
freeAllSeq(&whole);
freeAllSeq(&partList);
}
int main(int argc, char *argv[])
/* Align every sequence in the "other" list against the sequences in the
 * "geno" list and write the results to a psl file.
 *
 * Command line (non-DEBUG build):
 *     genoList otherList type oocFile out [noHead]
 *   genoList  - list read by readAllWordsOrFa; names starting with '#' are
 *               not loaded.
 *   otherList - list read the same way; names starting with '#' are skipped.
 *   type      - mRNA, cDNA, genomic, g2g or asm; selects stringency and
 *               related globals.
 *   oocFile   - passed to makePatSpace, or "none" to pass NULL.
 *   out       - output file; a psl header is written unless noHead is given.
 *
 * minMatch, maxBad and minBases can be overridden through cgiOptionalInt.
 * Returns 0 on completion; the error paths go through usage()/errAbort(). */
{
char *genoListName;
char *otherListName;
char *oocFileName;
char *typeName;
char *outName;
struct patSpace *patSpace;
long startTime, endTime;
char **genoList;
int genoListSize;
char *genoListBuf;
char **otherList;
int otherListSize;
char *otherListBuf;
char *genoName;
int i;
struct dnaSeq **seqListList = NULL, *seq = NULL;
enum ffStringency stringency = ffCdna;
int seedSize = 10;
FILE *out;
boolean noHead = FALSE;
struct repeatTracker *rt;
struct hash *repeatHash = newHash(10);

hostName = getenv("HOST");
pushWarnHandler(warnHandler);

startTime = clock1();
cgiSpoof(&argc, argv);
minMatch = cgiOptionalInt("minMatch", minMatch);
maxBad = cgiOptionalInt("maxBad", maxBad);
minBases = cgiOptionalInt("minBases", minBases);

dnaUtilOpen();

#ifdef DEBUG
/* Hard wire command line input so don't have to type it in each 
 * time run the stupid Gnu debugger. */

genoListName = "pFoo/geno.lst";
otherListName = "pFoo/bacend.lst";
typeName = "genomic";
oocFileName = "/d/biodata/human/10.ooc";
outName = "pFoo/pFoo.psl";

#else

if (argc != 6 && argc != 7)
    usage();

genoListName = argv[1];
otherListName = argv[2];
typeName = argv[3];
oocFileName = argv[4];
if (sameWord(oocFileName, "none"))
    oocFileName = NULL;
outName = argv[5];
if (argc == 7)
    {
    if (sameWord("noHead", argv[6]))
        noHead = TRUE;
    else
        usage();
    }

#endif 

/* Map the sequence type onto alignment settings. */
if (sameWord(typeName, "mRNA") || sameWord(typeName, "cDNA"))
    {
    stringency = ffCdna;
    }
else if (sameWord(typeName, "genomic"))
    {
    stringency = ffTight;
    }
else if (sameWord(typeName, "g2g"))
    {
    stringency = ffTight;
    veryTight = TRUE;
    seedSize = 11;
    }
/* NOTE(review): this branch uses sameString while the others use sameWord;
 * confirm whether the difference is intended. */
else if (sameString(typeName, "asm"))
    {
    stringency = ffTight;
    avoidSelfSelf = TRUE;
    }
else
    {
    warn("Unrecognized otherType %s\n", typeName);
    usage();
    }

/* Load both file lists.  Each list is filtered first and only then checked
 * for emptiness, so a list whose files are all missing is reported. */
readAllWordsOrFa(genoListName, &genoList, &genoListSize, &genoListBuf);
filterMissingFiles(genoList, &genoListSize);
if (genoListSize <= 0)
    errAbort("There are no files that exist in %s\n", genoListName);
readAllWordsOrFa(otherListName, &otherList, &otherListSize, &otherListBuf);
filterMissingFiles(otherList, &otherListSize);
if (otherListSize <= 0)
    errAbort("There are no files that exist in %s\n", otherListName);
out = mustOpen(outName, "w");
if (!noHead)
    pslWriteHead(out);

/* Read every genomic file and register a repeatTracker per sequence. */
AllocArray(seqListList, genoListSize);
for (i=0; i<genoListSize; ++i)
    {
    genoName = genoList[i];
    /* For a '#' entry nothing is read; seq is still NULL from the end of the
     * previous pass (or its initializer), so the loop below does not run. */
    if (!startsWith("#", genoName)  )
        seqListList[i] = seq = faReadAllDna(genoName);
    for (;seq != NULL; seq = seq->next)
        {
        int size = seq->size;
        char *name = seq->name;
        struct hashEl *hel;
        AllocVar(rt);
        AllocArray(rt->repBytes, size);
        rt->seq = seq;
        if ((hel = hashLookup(repeatHash, name)) != NULL)
            errAbort("Duplicate %s in %s\n", name, genoName);
        hashAdd(repeatHash, name, rt);
        }
    /* NOTE(review): storeMasked is called for '#' entries as well - confirm
     * that this is intended. */
    storeMasked(repeatHash, genoName);
    }

patSpace = makePatSpace(seqListList, genoListSize, seedSize, oocFileName, minMatch, 2000);
endTime = clock1();
printf("Made index in %ld seconds\n",  (endTime-startTime));
startTime = endTime;

/* Align each sequence of each "other" file on both strands. */
for (i=0; i<otherListSize; ++i)
    {
    FILE *f;
    char *otherName;
    int c;
    struct dnaSeq otherSeq;
    ZeroVar(&otherSeq);

    otherName = otherList[i];
    if (startsWith("#", otherName)  )
        continue;
    f = mustOpen(otherName, "r");
    /* Consume input up to and including the first '>' before handing the
     * stream to faFastReadNext. */
    while ((c = fgetc(f)) != EOF)
        if (c == '>')
            break;
    printf("%s\n", otherName);
    fflush(stdout);
    while (faFastReadNext(f, &otherSeq.dna, &otherSeq.size, &otherSeq.name))
        {
        /* aliSeqName is set only for the duration of the two strand passes. */
        aliSeqName = otherSeq.name;
        oneStrand(patSpace, repeatHash, &otherSeq, FALSE, stringency, out);
        reverseComplement(otherSeq.dna, otherSeq.size);
        oneStrand(patSpace, repeatHash, &otherSeq, TRUE, stringency, out);
        aliSeqName = NULL;
        }
    fclose(f);
    }
freePatSpace(&patSpace);
endTime = clock1();
printf("Alignment time is %ld sec\n", (endTime-startTime));
startTime = endTime;
fclose(out);
return 0;
}
Ejemplo n.º 3
0
int main(int argc, char *argv[])
/* Align every EST in a query file against a set of target sequences.
 *
 * Command line: estFile targetFile oocFile
 *   estFile    - query sequences, read one at a time with faReadNext.
 *   targetFile - target sequences, read with faReadAllDna; rejected when
 *                totalSequenceSize reports more than 8000000 bases.
 *   oocFile    - passed to makePatSpace.
 *
 * Each EST is searched on both strands with patSpaceFindOne, and every clump
 * found is aligned with ffFind.  Hits and alignment blocks go to stdout,
 * progress messages to stderr.  Returns 0 on completion; usage() is assumed
 * not to return. */
{
    char *estName, *targetName, *oocName;
    FILE *estFile;
    struct dnaSeq *target;
    struct dnaSeq *est;
    struct patSpace *ps;
    struct patClump *clumpList, *clump;
    int estIx = 0;

    /* Check command line arguments and assign to local variables. */
    if (argc != 4)
        usage();
    estName = argv[1];
    estFile = mustOpen(estName, "rb");
    targetName = argv[2];
    oocName = argv[3];

    /* Read in target DNA from fasta files and check not too big. */
    fprintf(stderr, "Reading %s\n", targetName);
    target = faReadAllDna(targetName);
    if (totalSequenceSize(target) > 8000000)
    {
        errAbort("Can only handle 8000000 bases of genomic sequence at once, %s has %d.",
                 targetName, totalSequenceSize(target));
    }

    /* Make a pattern space index structure. */
    fprintf(stderr, "Making Pattern Space index\n");
    ps = makePatSpace(&target, 1, oocName, 4, 32000);

    /* Loop through each EST in query list. */
    printf("Searching for hits\n\n");
    while (faReadNext(estFile, NULL, TRUE, NULL, &est))
    {
        boolean isRc;   /* Reverse complemented? */

        if (++estIx % 5000 == 0)
            fprintf(stderr, "Processing EST %d\n", estIx);
        if (est->size > 20000)
        {
            warn("Very large EST sequence %s.\n"
                 "Maybe you mixed up the EST and genomic parameters?", est->name);
            usage();
        }

        for (isRc = 0; isRc <= 1; ++isRc)   /* Search both strands. */
        {
            /* The second pass flips the EST in place; it stays flipped until
             * it is freed below. */
            if (isRc)
                reverseComplement(est->dna, est->size);
            clumpList = patSpaceFindOne(ps, est->dna, est->size);

            /* For each homology clump patSpace finds, do a fuzzyFinder
             * alignment of it and print the results. */
            for (clump = clumpList; clump != NULL; clump = clump->next)
            {
                struct ffAli *ali, *a;
                int score;
                struct dnaSeq *t = clump->seq;
                DNA *tStart = t->dna + clump->start;

                /* No local isRc here: the strand printed below must be the
                 * one held by the enclosing strand loop. */
                ali = ffFind(est->dna, est->dna+est->size, tStart, tStart + clump->size, ffCdna);
                if (ali != NULL)
                {
                    score = ffScoreCdna(ali);
                    /* A reverse-complemented query is a '-' strand hit. */
                    printf("%s hits %s strand %c score %d\n",
                           est->name, t->name, (isRc ? '-' : '+'), score);
                    for (a = ali; a != NULL; a = a->right)
                    {
                        /* Pointer differences are ptrdiff_t; convert them to
                         * match the %d conversions. */
                        printf("  Q %4d - %4d\t T %4d -%4d\n",
                               (int)(a->nStart - est->dna), (int)(a->nEnd - est->dna),
                               (int)(a->hStart - t->dna), (int)(a->hEnd - t->dna));
                    }
                    printf("\n");
                    ffFreeAli(&ali);
                }
                else
                {
                    printf("Couldn't align clump at %s %d-%d\n",
                           t->name, clump->start, clump->start + clump->size);
                }
            }
            slFreeList(&clumpList);
        }
        freeDnaSeq(&est);
    }
    /* Clean up time. */
    fclose(estFile);
    freePatSpace(&ps);
    freeSeqList(&target);
    return 0;
}