void outputChunk(struct psl **pPslList, char *tempDir, int midIx, boolean noHead)
/* Write out the psls on *pPslList to a single file and then free the list.
 * The list is written in its current order; no sorting is done here.
 * The file name comes from makeMidName(tempDir, midIx), except when
 * chunkSize is 1, in which case it is tempDir/<qName of first psl>.psl.
 * When stripVer is set the first psl's qName is truncated in place at its
 * first '.'.  The psl header is written unless noHead is set.
 * An empty list produces no file. */
{
char path[512];
struct psl *first = *pPslList;
struct psl *el;
FILE *out;

if (first == NULL)
    return;

makeMidName(tempDir, midIx, path);
if (stripVer)
    {
    /* Cut the name of the first record off at the first dot. */
    char *dot = stringIn(".", first->qName);
    if (dot != NULL)
        *dot = '\0';
    }
if (chunkSize == 1)
    safef(path, sizeof(path), "%s/%s.psl", tempDir, first->qName);

out = mustOpen(path, "w");
if (!noHead)
    pslWriteHead(out);
el = first;
while (el != NULL)
    {
    pslTabOut(el, out);
    el = el->next;
    }
fclose(out);
pslFreeList(pPslList);
}
Beispiel #2
0
void pslCut(char *cutList, char *inPsl, char *outPsl)
/* pslCut - Remove a list of clones from psl file.
 * Reads every psl in inPsl, converts its target name to a clone name with
 * fragToCloneName, and copies the record to outPsl unless that clone name
 * is present in the hash that buildCutHash fills from cutList.  A psl header
 * is always written to outPsl.  Prints how many records were cut out of the
 * total read. */
{
struct hash *cutHash = newHash(0);
struct lineFile *lf = pslFileOpen(inPsl);
FILE *f = mustOpen(outPsl, "w");
struct psl *psl;
char cloneName[128];
int total = 0, cut = 0;

buildCutHash(cutList, cutHash);
pslWriteHead(f);
while ((psl = pslNext(lf)) != NULL)
    {
    fragToCloneName(psl->tName, cloneName);
    if (hashLookup(cutHash, cloneName))
        ++cut;
    else
        pslTabOut(psl, f);
    ++total;
    pslFree(&psl);
    }
/* Release the input and close the output so it is flushed here rather
 * than left open until process exit. */
lineFileClose(&lf);
fclose(f);
printf("Cut %d of %d\n", cut, total);
}
void pslCopyInClones(char *listFile, char *partDir, char *outName)
/* Concatenate the .psl files for the clones named in listFile into outName.
 * The files to read are whatever getFileList(listFile, partDir) returns.
 * One psl header is written at the top of outName, followed by every record
 * from every input file.  Prints the record and file counts when done. */
{
FILE *out = mustOpen(outName, "w");
struct slName *inList, *inEl;
int pslCount = 0, fileCount = 0;

pslWriteHead(out);
inList = getFileList(listFile, partDir);
inEl = inList;
while (inEl != NULL)
    {
    struct lineFile *lf = pslFileOpen(inEl->name);
    struct psl *psl;

    ++fileCount;
    for (psl = pslNext(lf); psl != NULL; psl = pslNext(lf))
        {
        pslTabOut(psl, out);
        pslFree(&psl);
        ++pslCount;
        }
    lineFileClose(&lf);
    inEl = inEl->next;
    }
printf("%d psls in %d files written to %s\n", pslCount, fileCount, outName);
fclose(out);
}
Beispiel #4
0
void fastaToPsl(char *inName, char *outName)
/* fastaToPsl - Convert a FASTA alignment to psl format.
 * The first record of inName is used as the query; every following record
 * is treated as a target and turned into one psl against that query with
 * pslFromAlign.  Sequence lengths passed to pslFromAlign are the counts
 * returned by countNonDash.  A psl header is always written to outName.
 * Records for which pslFromAlign returns NULL are skipped. */
{
struct lineFile *inLF;
FILE *outFh;
boolean read;
struct psl* pslAlign;

DNA *qSeq;
int qSize;
int qSeqLen;
char *qHeader;

DNA *tSeq;
int tSize;
int tSeqLen;
char *tHeader;

int queryCounter;

inLF  = lineFileOpen(inName, TRUE);
outFh = mustOpen(outName, "w");

/* Read the query sequence (the first record).  It is cloned because the
 * later faMixedSpeedReadNext calls are made while it is still needed
 * (presumably the reader reuses its buffers - confirm against fa.h). */
read = faMixedSpeedReadNext(inLF, &qSeq, &qSize, &qHeader);
if (!read)
    errAbort("Could not read target FASTA entry.");
qSeq    = cloneString(qSeq);
qSeqLen = countNonDash(qSeq, qSize);
qHeader = cloneString(qHeader);
verbose(2, "Query sequence header: %s\n", qHeader);
verbose(3, "Query sequence alignment length: %d\n", qSize);
verbose(3, "Query sequence length: %d\n", qSeqLen);
verbose(4, "Query sequence: %s\n", qSeq);

/* Read the rest of the sequences; each one is a target. */
queryCounter = 1;
pslWriteHead(outFh);
while (faMixedSpeedReadNext(inLF, &tSeq, &tSize, &tHeader))
    {
    tSeqLen = countNonDash(tSeq, tSize);

    verbose(2, "Target sequence (%d) header: %s\n", queryCounter, tHeader);
    verbose(3, "Target sequence (%d) length: %d\n", queryCounter, tSeqLen);
    verbose(4, "Target sequence (%d): %s\n", queryCounter, tSeq);

    pslAlign = pslFromAlign(qHeader, qSeqLen, 0, qSeqLen, qSeq,
                            tHeader, tSeqLen, 0, tSeqLen, tSeq,
                            "+", 0);
    if (pslAlign != NULL)
        {
        pslTabOut(pslAlign, outFh);
        pslFree(&pslAlign);     /* one psl per record; don't accumulate them */
        }
    else
        {
        /* Nothing to write for this record; don't hand NULL to pslTabOut. */
        verbose(2, "Target sequence (%d) %s produced no alignment; skipped\n",
                queryCounter, tHeader);
        }

    ++queryCounter;
    }

lineFileClose(&inLF);
fclose(outFh);
}
void pslGlue(char *inNames[], int inCount, char *outName, char *glueName)
/* Reduce psl files to only the gluing components.
 * Loads every psl from the inCount files in inNames, sorts them with
 * pslCmpQuery, then hands each run of records sharing a qName to simpleOut
 * together with the two output files.  simpleOut's return values are summed
 * into the glue count that is reported at the end.  outCount, ltot and mtot
 * in the report are defined outside this function. */
{
struct psl *allPsls = NULL, *cur, *following;
struct psl *bundle = NULL;      /* records of the query currently being gathered */
FILE *out, *glue;
int glueCount = 0;
int pslCount = 0;
int fileIx;

printf("Reading");
for (fileIx = 0; fileIx < inCount; ++fileIx)
    {
    char *inName = inNames[fileIx];
    struct lineFile *lf = pslFileOpen(inName);

    printf(" %s", inName);
    fflush(stdout);
    for (cur = pslNext(lf); cur != NULL; cur = pslNext(lf))
        {
        slAddHead(&allPsls, cur);
        ++pslCount;
        }
    lineFileClose(&lf);
    }
printf("\n");
slSort(&allPsls, pslCmpQuery);

out = mustOpen(outName, "w");
glue = mustOpen(glueName, "w");
pslWriteHead(out);

/* Walk the sorted list, flushing the bundle whenever the query changes. */
cur = allPsls;
while (cur != NULL)
    {
    following = cur->next;      /* saved first: slAddHead rewrites cur->next */
    if (bundle != NULL && !sameString(bundle->qName, cur->qName))
        {
        glueCount += simpleOut(out, glue, &bundle);
        bundle = NULL;
        }
    slAddHead(&bundle, cur);
    cur = following;
    }
glueCount += simpleOut(out, glue, &bundle);
printf("Got %d gluing mRNAs out of %d psls in %d bundles %d ltot %d mtot\n",
        glueCount, pslCount, outCount, ltot, mtot);
fclose(out);
fclose(glue);
}
Beispiel #6
0
void pslReps(char *inName, char *bestAliName, char *repName)
/* Analyse inName and hand the alignments of each query to doOneAcc, which
 * is given bestAliName's file for best alignments and repName's file for
 * repeat info.
 * Input lines are tab-split; 21 fields are loaded with pslLoad, 23 with
 * pslxLoad, anything else aborts.  Consecutive records with the same qName
 * are gathered into one list and passed to doOneAcc when the name changes
 * (and once more at end of input).  Progress output is suppressed when
 * either output name is "stdout". */
{
struct lineFile *in = pslFileOpen(inName);
FILE *bestFile = mustOpen(bestAliName, "w");
FILE *repFile = mustOpen(repName, "w");
char *words[32];
char *line;
int lineSize;
int wordCount;
int aliCount = 0;
struct psl *accList = NULL, *psl = NULL;
char lastName[512];

quiet = sameString(bestAliName, "stdout") || sameString(repName, "stdout");
if (coverQSizeFile != NULL)
    loadCoverQSizes(coverQSizeFile);

if (!quiet)
    printf("Processing %s to %s and %s\n", inName, bestAliName, repName);
if (!noHead)
    pslWriteHead(bestFile);

lastName[0] = '\0';
while (lineFileNext(in, &line, &lineSize))
    {
    /* Progress dot every 0x20000 lines. */
    ++aliCount;
    if (!quiet && (aliCount & 0x1ffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }

    wordCount = chopTabs(line, words);
    switch (wordCount)
        {
        case 21:
            psl = pslLoad(words);
            break;
        case 23:
            psl = pslxLoad(words);
            break;
        default:
            errAbort("Bad line %d of %s\n", in->lineIx, in->fileName);
            break;
        }

    /* Query changed: process what has been gathered and start over. */
    if (!sameString(lastName, psl->qName))
        {
        doOneAcc(lastName, accList, bestFile, repFile);
        pslFreeList(&accList);
        safef(lastName, sizeof(lastName), "%s", psl->qName);
        }
    slAddHead(&accList, psl);
    }
doOneAcc(lastName, accList, bestFile, repFile);
pslFreeList(&accList);
lineFileClose(&in);
fclose(bestFile);
fclose(repFile);
if (!quiet)
    printf("Processed %d alignments\n", aliCount);
}
void pslGlueRna(char *listFile, char *partDir, char *pslName, char *gluName)
/* Reduce psl files to only the gluing mRNA/EST components.
 * Loads every psl from the files returned by getFileList(listFile, partDir),
 * sorts them with pslCmpQuery, then passes each run of records sharing a
 * qName to output() along with the psl and glue output files.  The values
 * output() returns are summed and reported at the end; outCount, ltot and
 * mtot in that report are defined outside this function. */
{
struct psl *allPsls = NULL, *cur, *following;
struct psl *bundle = NULL;      /* records of the query currently being gathered */
struct slName *inList, *inEl;
FILE *pslOut, *gluOut;
int glueCount = 0;
int pslCount = 0;

inList = getFileList(listFile, partDir);
inEl = inList;
while (inEl != NULL)
    {
    struct lineFile *lf = pslFileOpen(inEl->name);

    for (cur = pslNext(lf); cur != NULL; cur = pslNext(lf))
        {
        slAddHead(&allPsls, cur);
        ++pslCount;
        }
    lineFileClose(&lf);
    inEl = inEl->next;
    }
slSort(&allPsls, pslCmpQuery);

pslOut = mustOpen(pslName, "w");
gluOut = mustOpen(gluName, "w");
pslWriteHead(pslOut);

/* Walk the sorted list, flushing the bundle whenever the query changes. */
cur = allPsls;
while (cur != NULL)
    {
    following = cur->next;      /* saved first: slAddHead rewrites cur->next */
    if (bundle != NULL && !sameString(bundle->qName, cur->qName))
        {
        glueCount += output(pslOut, gluOut, &bundle);
        bundle = NULL;
        }
    slAddHead(&bundle, cur);
    cur = following;
    }
glueCount += output(pslOut, gluOut, &bundle);
printf("Got %d gluing mRNAs out of %d psls in %d bundles %d ltot %d mtot to %s\n", 
        glueCount, pslCount, outCount, ltot, mtot, gluName);
fclose(pslOut);
fclose(gluOut);
}
Beispiel #8
0
void outputChunk(struct psl **pPslList, char *tempDir, int midIx)
/* Sort *pPslList with pslCmpTarget, write it (with a psl header) to the
 * file named by makeMidName(tempDir, midIx), and free the list.
 * An empty list produces no file. */
{
char path[512];
struct psl *el;
FILE *out;

if (*pPslList == NULL)
    return;

slSort(pPslList, pslCmpTarget);
makeMidName(tempDir, midIx, path);
out = mustOpen(path, "w");
pslWriteHead(out);
el = *pPslList;
while (el != NULL)
    {
    pslTabOut(el, out);
    el = el->next;
    }
fclose(out);
pslFreeList(pPslList);
}
int main(int argc, char *argv[])
/* Index the sequences of the files in the "geno" list with makePatSpace,
 * then run oneStrand on every sequence of every file in the "other" list,
 * once as read and once after reverseComplement, writing to the output file.
 * Command line: genoList otherList type oocFile|none outFile [noHead].
 * type selects the stringency: mRNA/cDNA, genomic, g2g or asm.
 * minMatch, maxBad and minBases can be overridden as cgi-style options.
 * List entries starting with '#' are skipped. */
{
char *genoListName;
char *otherListName;
char *oocFileName;
char *typeName;
char *outName;
struct patSpace *patSpace;
long startTime, endTime;
char **genoList;
int genoListSize;
char *genoListBuf;
char **otherList;
int otherListSize;
char *otherListBuf;
char *genoName;
int i;
struct dnaSeq **seqListList = NULL, *seq = NULL;
enum ffStringency stringency = ffCdna;
int seedSize = 10;
FILE *out;
boolean noHead = FALSE;
struct repeatTracker *rt;
struct hash *repeatHash = newHash(10);

hostName = getenv("HOST");
pushWarnHandler(warnHandler);

startTime = clock1();
cgiSpoof(&argc, argv);
minMatch = cgiOptionalInt("minMatch", minMatch);
maxBad = cgiOptionalInt("maxBad", maxBad);
minBases = cgiOptionalInt("minBases", minBases);

dnaUtilOpen();

#ifdef DEBUG
/* Hard wire command line input so don't have to type it in each 
 * time run the stupid Gnu debugger. */

genoListName = "pFoo/geno.lst";
otherListName = "pFoo/bacend.lst";
typeName = "genomic";
oocFileName = "/d/biodata/human/10.ooc";
outName = "pFoo/pFoo.psl";

#else

if (argc != 6 && argc != 7)
    usage();

genoListName = argv[1];
otherListName = argv[2];
typeName = argv[3];
oocFileName = argv[4];
if (sameWord(oocFileName, "none"))
    oocFileName = NULL;
outName = argv[5];
if (argc == 7)
    {
    if (sameWord("noHead", argv[6]))
        noHead = TRUE;
    else
        usage();
    }

#endif 

/* Map the type name onto alignment settings. */
if (sameWord(typeName, "mRNA") || sameWord(typeName, "cDNA"))
    {
    stringency = ffCdna;
    }
else if (sameWord(typeName, "genomic"))
    {
    stringency = ffTight;
    }
else if (sameWord(typeName, "g2g"))
    {
    stringency = ffTight;
    veryTight = TRUE;
    seedSize = 11;
    }
else if (sameString(typeName, "asm"))
    {
    stringency = ffTight;
    avoidSelfSelf = TRUE;
    }
else
    {
    warn("Unrecognized otherType %s\n", typeName);
    usage();
    }

/* Load both file lists.  Each is filtered for missing files BEFORE the
 * emptiness check, so the check reflects files that actually exist. */
readAllWordsOrFa(genoListName, &genoList, &genoListSize, &genoListBuf);
filterMissingFiles(genoList, &genoListSize);
if (genoListSize <= 0)
    errAbort("There are no files that exist in %s\n", genoListName);
readAllWordsOrFa(otherListName, &otherList, &otherListSize, &otherListBuf);
filterMissingFiles(otherList, &otherListSize);
if (otherListSize <= 0)
    errAbort("There are no files that exist in %s\n", otherListName);
out = mustOpen(outName, "w");
if (!noHead)
    pslWriteHead(out);

/* Read the genomic sequences and give each one a repeatTracker, keyed by
 * sequence name in repeatHash. */
AllocArray(seqListList, genoListSize);
for (i=0; i<genoListSize; ++i)
    {
    genoName = genoList[i];
    /* seq is NULL on entry to every iteration (initially, and because the
     * inner loop below runs it to NULL), so a '#' entry adds no sequences.
     * storeMasked is still called for such an entry. */
    if (!startsWith("#", genoName)  )
        seqListList[i] = seq = faReadAllDna(genoName);
    for (;seq != NULL; seq = seq->next)
        {
        int size = seq->size;
        char *name = seq->name;
        struct hashEl *hel;
        AllocVar(rt);
        AllocArray(rt->repBytes, size);
        rt->seq = seq;
        if ((hel = hashLookup(repeatHash, name)) != NULL)
            errAbort("Duplicate %s in %s\n", name, genoName);
        hashAdd(repeatHash, name, rt);
        }
    storeMasked(repeatHash, genoName);
    }

patSpace = makePatSpace(seqListList, genoListSize, seedSize, oocFileName, minMatch, 2000);
endTime = clock1();
printf("Made index in %ld seconds\n",  (endTime-startTime));
startTime = endTime;

/* Align every sequence of every "other" file. */
for (i=0; i<otherListSize; ++i)
    {
    FILE *f;
    char *otherName;
    int c;
    struct dnaSeq otherSeq;
    ZeroVar(&otherSeq);

    otherName = otherList[i];
    if (startsWith("#", otherName)  )
        continue;
    f = mustOpen(otherName, "r");
    /* Consume input up to and including the first '>' before handing the
     * stream to faFastReadNext. */
    while ((c = fgetc(f)) != EOF)
        if (c == '>')
            break;
    printf("%s\n", otherName);
    fflush(stdout);
    while (faFastReadNext(f, &otherSeq.dna, &otherSeq.size, &otherSeq.name))
        {
        aliSeqName = otherSeq.name;
        oneStrand(patSpace, repeatHash, &otherSeq, FALSE, stringency, out);
        reverseComplement(otherSeq.dna, otherSeq.size);
        oneStrand(patSpace, repeatHash, &otherSeq, TRUE, stringency, out);
        aliSeqName = NULL;
        }
    fclose(f);
    }
freePatSpace(&patSpace);
endTime = clock1();
printf("Alignment time is %ld sec\n", (endTime-startTime));
startTime = endTime;
fclose(out);
return 0;
}
void liftPsl(char *destFile, struct hash *liftHash, int sourceCount, char *sources[],
	boolean querySide, boolean isExtended)
/* Lift up coordinates in .psl file.
 * Reads every record of each existing file in sources, looks up a lift spec
 * for its query name (querySide) or target name (otherwise) with findLift,
 * rewrites the coordinates, size and name on that side according to the
 * spec, and writes the record to destFile.  Missing source files are warned
 * about and skipped.  Records are read/written as xAli when isExtended is
 * set, as plain psl otherwise.  A psl header is written first unless the
 * file-level nohead flag is set.  Behaviour also depends on the file-level
 * settings how, isPtoG and dots, which are defined outside this function. */
{
FILE *dest = mustOpen(destFile, "w");
char *source;
int i,j;
struct lineFile *lf;
struct psl *psl;
struct xAli *xa = NULL;
unsigned *starts;
unsigned *blockSizes;
struct liftSpec *spec;
int offset;
int blockCount;
char *seqName;
int dotMod = dots;
int seqSize;
/* Index into psl->strand of the side being lifted: 0 for the query side,
 * 1 for the target side. */
int strandChar = (querySide ? 0 : 1);

if (!nohead)
    pslWriteHead(dest);
for (i=0; i<sourceCount; ++i)
    {
    source = sources[i];
    if (!fileExists(source))
	{
	warn("%s doesn't exist!", source);
	continue;
	}
    verbose(1, "Lifting %s\n", source);
    lf = pslFileOpenWithMeta(source, dest);
    for (;;)
        {
	/* Fetch the next record.  For extended input the xAli pointer is
	 * also used through a struct psl pointer below. */
	if (isExtended)
	    {
	    xa = xAliNext(lf);
	    psl = (struct psl *)xa;
	    }
	else
	    psl = pslNext(lf);
	if (psl == NULL)
	    break;
	boolean isProt = pslIsProtein(psl);

	doDots(&dotMod);
	/* seqName keeps the record's own name pointer; it is put back
	 * before the record is freed. */
	if (querySide)
	    seqName = psl->qName;
	else
	    seqName = psl->tName;
	spec = findLift(liftHash, seqName, lf);
	if (spec == NULL)
	    {
	    /* No lift spec: drop the record unless missing ones are carried,
	     * in which case it is written out unchanged below. */
	    if (how != carryMissing)
	        {
		freePslOrXa(psl, isExtended);
		continue;
		}
	    }
	else
	    {
	    offset = spec->offset;
	    blockSizes = psl->blockSizes;
	    /* First adjust the overall start/end on the lifted side and pick
	     * the per-block starts array and sequence size for that side. */
	    if (querySide)
	        {
		if (!isPtoG)
		    {
		    cantHandleSpecRevStrand(spec);
		    psl->qStart += offset;
		    psl->qEnd += offset;
		    }
		else
		    {
		    /* isPtoG: counts and query coordinates are scaled by 3. */
		    psl->match *= 3;
		    psl->misMatch *= 3;
		    if (spec->strand == '-')
			{
			int tmp = psl->qEnd;
			psl->qEnd = psl->qStart;
			psl->qStart = tmp;
			psl->qStart *= -3;
			psl->qEnd *= -3;
			psl->qStart += offset;
			psl->qEnd += offset;
			}
		    else if (spec->strand == '+')
			{
			psl->qStart *= 3;
			psl->qStart += offset;
			psl->qEnd *= 3;
			psl->qEnd += offset;
			}
		    }
		starts = psl->qStarts;
		seqSize = psl->qSize;
		}
	    else
	        {
		if (spec->strand == '-')
		    reverseIntRange(&psl->tStart, &psl->tEnd, psl->tSize);
		psl->tStart += offset;
		psl->tEnd += offset;
		starts = psl->tStarts;
		seqSize = psl->tSize;
		}
	    blockCount = psl->blockCount;
	    /* Now adjust the per-block start positions. */
	    if (isPtoG && (spec->strand == '-'))
	        {
		psl->strand[strandChar] = spec->strand;
		for (j=0; j<blockCount; ++j)
		    {
		    starts[j] *= -3;
		    starts[j] += offset;
		    starts[j] = spec->newSize - starts[j];
		    }
		}
	    else if (isPtoG && (spec->strand == '+'))
	        {
		psl->strand[strandChar] = spec->strand;
		for (j=0; j<blockCount; ++j)
		    {
		    starts[j] *= 3;
		    starts[j] += offset;
		    }
		}
	    else /* mRNA case. */
		{
		if (spec->strand == '+')
		    {
		    if (psl->strand[strandChar] == '-')
			{
			/* Record is on '-' for this side: convert each start
			 * via the old sequence size, add the offset, then
			 * express it relative to the new size. */
			for (j=0; j<blockCount; ++j)
			    {
			    int tr = seqSize - starts[j];
			    tr += offset;
			    starts[j] = spec->newSize - tr;
			    }
			}
		    else
			{
			for (j=0; j<blockCount; ++j)
			    starts[j] += offset;
			}
		    }
		else
		    {
		    /* Lift spec is not on the '+' strand. */
		    if (isProt)
			{
			/* if it's protein, we can't reverse the query */
			if (psl->strand[strandChar] == '-')
			    {
			    for (j=0; j<blockCount; ++j)
				starts[j] += offset;
			    }
			else
			    {
			    for (j=0; j<blockCount; ++j)
				{
				int tr = seqSize - starts[j];
				tr += offset;
				starts[j] = spec->newSize - tr;
				}
			    }
			psl->strand[strandChar] = 
			    flipStrand(psl->strand[strandChar]);
			}
		    else
			{
			if (psl->strand[strandChar] == '-')
			     errAbort("Can't handle all these minus strands! line %d",lf->lineIx);
			else
			    {
			    /* Flip block starts on both sides (offset applied to
			     * the target side only), flip the strand of the
			     * other side, and reverse the block arrays. */
			    for (j=0; j<blockCount; ++j)
				{
				psl->tStarts[j] = psl->tSize - 
				    (psl->tStarts[j] + blockSizes[j]) + offset;
				psl->qStarts[j] = psl->qSize - 
				    (psl->qStarts[j] + blockSizes[j]);	/* no offset. */
				}
			    psl->strand[1-strandChar] = 
				flipStrand(psl->strand[1-strandChar]);
			    reverseUnsigned(blockSizes, blockCount);
			    reverseUnsigned(psl->qStarts, blockCount);
			    reverseUnsigned(psl->tStarts, blockCount);
			    }
			}
		    }
		}

	    if (isPtoG)
		for (j=0; j<blockCount; ++j)
		    blockSizes[j] *= 3;
	    /* Point the lifted side at the spec's new name and size for
	     * output. */
	    if (querySide)
	        {
		psl->qSize = spec->newSize;
		psl->qName = spec->newName;
		}
	    else
	        {
		psl->tSize = spec->newSize;
		psl->tName = spec->newName;
		}
	    }
	if (isExtended)
	    {
	    xAliTabOut(xa, dest);
	    }
	else
	    {
	    pslTabOut(psl, dest);
	    }
	/* Restore the record's original name pointer before freeing it
	 * (presumably so freePslOrXa does not free the spec's newName -
	 * NOTE(review): confirm against freePslOrXa). */
	if (querySide)
	    psl->qName = seqName;
	else
	    psl->tName = seqName;
	freePslOrXa(psl, isExtended);
	}
    lineFileClose(&lf);
    if (dots)
        verbose(1, "\n");
    }
/* Catch any write error on the output before closing it. */
if (ferror(dest))
    errAbort("error writing %s", destFile);
fclose(dest);
}
Beispiel #11
0
static void pslHead(struct gfOutput *out, FILE *f)
/* Header writer for psl output: emits the standard psl header on f.
 * The out argument is not consulted. */
{
pslWriteHead(f);
}
Beispiel #12
0
int main(int argc, char *argv[]) {
   /*
    * Arguments/options.
    *
    * These point straight at the option arguments in argv (via optarg)
    * instead of being copied into fixed-size buffers, so an over-long
    * argument can no longer overflow the stack.  NULL means "not given".
    */
   char *outputFile = NULL;
   char *inputFile = NULL;
   char *query = NULL;
   char *target = NULL;

   ///////////////////////////////////////////////////////////////////////////
   // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
   ///////////////////////////////////////////////////////////////////////////

   while(1) {
      static struct option long_options[] = {
         { "query", required_argument, 0, 'q' },
         { "target", required_argument, 0, 't' },
         { "outputFile", required_argument, 0, 'o' },
         { "inputFile", required_argument, 0, 'i' },
         { "help", no_argument, 0, 'h' },
         { 0, 0, 0, 0 }
      };

      int option_index = 0;

      int key = getopt_long(argc, argv, "i:o:q:t:h", long_options, &option_index);

      if(key == -1) {
         break;
      }

      switch(key) {
         case 'i':
            inputFile = optarg;
            break;
         case 'o':
            outputFile = optarg;
            break;
         case 'q':
            query = optarg;
            break;
         case 't':
            target = optarg;
            break;
         case 'h':
            usage();
            return 0;
         default:
            usage();
            return 1;
      }
   }

   ///////////////////////////////////////////////////////////////////////////
   // (0) Check the inputs.
   ///////////////////////////////////////////////////////////////////////////

   /* All four options are used unconditionally below, so all are required.
    * Checked explicitly rather than with assert so the check survives
    * NDEBUG builds. */
   if(inputFile == NULL || outputFile == NULL || query == NULL || target == NULL) {
      fprintf(stderr, "Missing required option: --inputFile, --outputFile, "
                      "--query and --target must all be given\n");
      usage();
      return 1;
   }

   FILE *fileHandle = fopen(outputFile, "w");
   if(fileHandle == NULL) {
      perror(outputFile);
      return 1;
   }
   pslWriteHead(fileHandle);
   struct psl *pslList = pslLoadAll(inputFile);
   mapPSLs(pslList, fileHandle, query, target);
   if(fclose(fileHandle) != 0) {
      perror(outputFile);
      return 1;
   }
   
   return 0;
}
void pslSort2(char *outDir, char *tempDir, boolean noHead)
/* Do second step of sort - merge all sorted files in tempDir
 * to final outdir.
 * Opens every tmp*.psl file in tempDir and repeatedly writes the record
 * that sorts lowest under pslCmpTarget among the current heads of those
 * files.  Output goes to outDir/<acc>.psl, where <acc> is what getTargetAcc
 * derives from the record's tName; a new file is started each time that
 * value changes, with a psl header unless noHead is set.  The temp files
 * are removed afterwards.  Aborts if tempDir has no tmp*.psl files or if a
 * path does not fit in the file name buffer. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
FILE *f = NULL;
char lastTargetAcc[256];
char targetAcc[256];


strcpy(lastTargetAcc, "");
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    safef(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
printf("writing %s", outDir);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    if ( (++aliCount & 0xffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill this file's head record if it was consumed; close the
         * file once it runs dry. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpTarget(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    if (bestMid == NULL)
        break;
    getTargetAcc(bestMid->psl->tName, targetAcc);
    /* Also open a file when none is open yet, so f is never NULL at the
     * pslTabOut below even if the very first accession is empty. */
    if (f == NULL || !sameString(targetAcc, lastTargetAcc))
        {
        strcpy(lastTargetAcc, targetAcc);
        carefulClose(&f);
        safef(fileName, sizeof(fileName), "%s/%s.psl", outDir, targetAcc);
        f = mustOpen(fileName, "w");
        if (!noHead)
            pslWriteHead(f);
        }
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
carefulClose(&f);
printf("\n");

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    safef(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    remove(fileName);
    }
}
Beispiel #14
0
void pslSort(char *command, char *outFile, char *tempDir, char *inDirs[], int inDirCount)
/* Do the two step sort.
 * command must start with "dirs" or "g2g"; a trailing "1" runs only the
 * first step and a trailing "2" only the second.  With "g2g", records from
 * files for which selfFile() is false are also added in mirrored form, and
 * '+' strand records whose qName equals tName and qStart equals tStart are
 * dropped.
 * Step one gathers the *.psl (or, failing that, *.psl.gz) files of every
 * directory in inDirs, reads them in groups of about sqrt(fileCount) files,
 * sorts each group with pslCmpQuery and writes it to a mid file in tempDir
 * (with a psl header unless the file-level nohead flag is set).
 * Step two is pslSort2(outFile, tempDir). */
{
int i;
struct slName *fileList = NULL, *name;
char *inDir;
struct slName *dirDir, *dirFile;
char fileName[512];
int fileCount;
int totalFilesProcessed = 0;
int filesPerMidFile;
int midFileCount = 0;
FILE *f;
struct lineFile *lf;
boolean doReflect = FALSE;
boolean suppressSelf = FALSE;
boolean firstOnly = endsWith(command, "1");
boolean secondOnly = endsWith(command, "2");

if (startsWith("dirs", command))
    ;
else if (startsWith("g2g", command))
    {
    doReflect = TRUE;
    suppressSelf = TRUE;
    }
else
    usage();


if (!secondOnly)
    {
    makeDir(tempDir);
    /* Figure out how many files to process. */
    for (i=0; i<inDirCount; ++i)
        {
        inDir = inDirs[i];
        dirDir = listDir(inDir, "*.psl");
        if (slCount(dirDir) == 0)
            dirDir = listDir(inDir, "*.psl.gz");
        if (slCount(dirDir) == 0)
            errAbort("No psl files in %s\n", inDir);
        verbose(1, "%s with %d files\n", inDir, slCount(dirDir));
        for (dirFile = dirDir; dirFile != NULL; dirFile = dirFile->next)
            {
            /* Bounded formatting: a long directory or file name must not
             * run past the end of fileName. */
            safef(fileName, sizeof(fileName), "%s/%s", inDir, dirFile->name);
            name = newSlName(fileName);
            slAddHead(&fileList, name);
            }
        slFreeList(&dirDir);
        }
    verbose(1, "%d files in %d dirs\n", slCount(fileList), inDirCount);
    slReverse(&fileList);
    fileCount = slCount(fileList);
    filesPerMidFile = round(sqrt(fileCount));
    verbose(1, "Got %d files %d files per mid file\n", fileCount, filesPerMidFile);

    /* Read in files a group at a time, sort, and write merged, sorted
     * output of one group. */
    name = fileList;
    while (totalFilesProcessed < fileCount)
        {
        int filesInMidFile = 0;
        struct psl *pslList = NULL, *psl;
        int lfileCount = 0;
        /* Every psl of this group comes from this pool (nextLmPsl and
         * mirrorLmPsl are both handed it) and the pool is released as a
         * whole by lmCleanup below. */
        struct lm *lm = lmInit(256*1024);

        for (filesInMidFile = 0; filesInMidFile < filesPerMidFile && name != NULL;
            ++filesInMidFile, ++totalFilesProcessed, name = name->next)
            {
            boolean reflectMe = FALSE;
            if (doReflect)
                {
                reflectMe = !selfFile(name->name);
                }
            verbose(2, "Reading %s (%d of %d)\n", name->name, totalFilesProcessed+1, fileCount);
            lf = pslFileOpen(name->name);
            while ((psl = nextLmPsl(lf, lm)) != NULL)
                {
                if (psl->qStart == psl->tStart && psl->strand[0] == '+' && 
                    suppressSelf && sameString(psl->qName, psl->tName))
                    {
                    continue;
                    }
                ++lfileCount;
                slAddHead(&pslList, psl);
                if (reflectMe)
                    {
                    psl = mirrorLmPsl(psl, lm);
                    slAddHead(&pslList, psl);
                    }
                }
            lineFileClose(&lf);
            }
        slSort(&pslList, pslCmpQuery);
        makeMidName(tempDir, midFileCount, fileName);
        verbose(1, "Writing %s\n", fileName);
        f = mustOpen(fileName, "w");
        if (!nohead)
            pslWriteHead(f);
        for (psl = pslList; psl != NULL; psl = psl->next)
            {
            pslTabOut(psl, f);
            }
        fclose(f);
        pslList = NULL;
        lmCleanup(&lm);
        verbose(2, "lfileCount %d\n", lfileCount);
        ++midFileCount;
        }
    }
if (!firstOnly)
    pslSort2(outFile, tempDir);
}
Beispiel #15
0
void pslSort2(char *outFile, char *tempDir)
/* Do second step of sort - merge all sorted files in tempDir
 * to final.
 * Opens every tmp*.psl file in tempDir and repeatedly writes to outFile the
 * record that sorts lowest under pslCmpQuery among the current heads of
 * those files.  A psl header is written first unless the file-level nohead
 * flag is set.  The temp files are removed afterwards.  Aborts if tempDir
 * has no tmp*.psl files or if a path does not fit in the file name buffer. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
FILE *f = mustOpen(outFile, "w");


if (!nohead)
    pslWriteHead(f);
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    safef(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
verbose(1, "writing %s", outFile);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    if ( (++aliCount & 0xffff) == 0)
        {
        verboseDot();
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill this file's head record if it was consumed; close the
         * file once it runs dry. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpQuery(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    if (bestMid == NULL)
        break;
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
printf("\n");
fclose(f);

/* The following really shouldn't be necessary: each file is already
 * closed in the merge loop once it is exhausted. */
for (mid = midList; mid != NULL; mid = mid->next)
    lineFileClose(&mid->lf);

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    safef(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    remove(fileName);
    }
}