Esempio n. 1
0
void pslSortAcc(char *command, char *outDir, char *tempDir, char *inFiles[], int inFileCount)
/* First step of the two step sort: read every input psl file, hand the
 * records to outputChunk in batches written under tempDir, then call
 * pslSort2 to produce the final output in outDir.
 *   command     - compared to "nohead" with sameWord; the result is passed
 *                 on to pslSort2 as its noHead flag.
 *   outDir      - final output directory (created with mode 0775).
 *   tempDir     - directory for intermediate chunk files (created likewise).
 *   inFiles     - array of input psl file names.
 *   inFileCount - number of entries in inFiles. */
{
const int maxChunkLines = 250000;   /* Records buffered before a chunk is flushed. */
int chunkLines = 0;                 /* Records buffered in the current chunk. */
int lineTotal = 0;                  /* Records read across all input files. */
int chunkFileCount = 0;             /* Number of outputChunk calls made so far. */
int fileIx;
struct psl *chunkList = NULL;
boolean noHead = sameWord(command, "nohead");

/* Return values are ignored, so already-existing directories are fine. */
mkdir(outDir, 0775);
mkdir(tempDir, 0775);

/* Pass 1: buffer records and flush a chunk every maxChunkLines records.
 * outputChunk is defined elsewhere; it is expected to sort and write the
 * list it is given. */
for (fileIx = 0; fileIx < inFileCount; ++fileIx)
    {
    char *path = inFiles[fileIx];
    struct lineFile *in;
    struct psl *rec;

    printf("Processing %s", path);
    fflush(stdout);
    in = pslFileOpen(path);
    for (rec = nextPsl(in); rec != NULL; rec = nextPsl(in))
        {
        slAddHead(&chunkList, rec);
        chunkLines += 1;
        if (chunkLines >= maxChunkLines)
            {
            outputChunk(&chunkList, tempDir, chunkFileCount);
            chunkFileCount += 1;
            chunkLines = 0;
            }
        /* Progress dot every 65536 records. */
        lineTotal += 1;
        if ((lineTotal & 0xffff) == 0)
            {
            printf(".");
            fflush(stdout);
            }
        }
    printf("\n");
    lineFileClose(&in);
    }

/* Flush whatever is left over from the last partial chunk. */
outputChunk(&chunkList, tempDir, chunkFileCount);
chunkFileCount += 1;
printf("Processed %d lines into %d temp files\n", lineTotal, chunkFileCount);

/* Pass 2: merge the chunk files. */
pslSort2(outDir, tempDir, noHead);
}
void pslSort2(char *outDir, char *tempDir, boolean noHead)
/* Do second step of sort - merge all sorted files in tempDir
 * to final outdir.
 *   outDir  - directory receiving one "<accession>.psl" file per distinct
 *             value returned by getTargetAcc for the record's tName.
 *   tempDir - directory scanned for "tmp*.psl" chunk files; those files
 *             are removed once the merge is finished.
 *   noHead  - when TRUE, pslWriteHead is not called on new output files.
 * Aborts via errAbort if there are no chunk files or a path would not fit
 * in the local file name buffer. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
int pathLen;
FILE *f = NULL;
char lastTargetAcc[256];
char targetAcc[256];


strcpy(lastTargetAcc, "");
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);

/* Open every chunk file and keep it on the merge list. */
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    /* Bounded formatting: directory and file names come from outside. */
    pathLen = snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
        errAbort("Path too long: %s/%s", tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
printf("writing %s", outDir);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    /* Progress dot every 65536 iterations. */
    if ( (++aliCount & 0xffff) == 0)
        {
        printf(".");
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill the look-ahead record; close the file once it is exhausted. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpTarget(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    if (bestMid == NULL)
        break;          /* Every chunk file is exhausted. */
    getTargetAcc(bestMid->psl->tName, targetAcc);
    /* Start a new output file whenever the accession changes. The f == NULL
     * test covers a first record whose accession is the empty string, which
     * would otherwise compare equal to the initial lastTargetAcc and be
     * written to a NULL file. */
    if (f == NULL || !sameString(targetAcc, lastTargetAcc))
        {
        /* Both buffers have the same size, so this copy cannot overflow
         * as long as targetAcc is terminated. */
        strcpy(lastTargetAcc, targetAcc);
        carefulClose(&f);
        pathLen = snprintf(fileName, sizeof(fileName), "%s/%s.psl", outDir, targetAcc);
        if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
            errAbort("Path too long: %s/%s.psl", outDir, targetAcc);
        f = mustOpen(fileName, "w");
        if (!noHead)
            pslWriteHead(f);
        }
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
carefulClose(&f);
printf("\n");

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    pathLen = snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
        errAbort("Path too long: %s/%s", tempDir, tmp->name);
    remove(fileName);
    }
}
void pslSplit(char *command, char *outDir,  char *inFiles[], int inFileCount)
/* pslSplit - split presorted psl input into multiple output files by qName.
 *   command     - compared to "nohead" with sameWord; the result is passed
 *                 to outputChunk as its noHead argument.
 *   outDir      - output directory (created with mode 0775).
 *   inFiles     - array of input psl file names.
 *   inFileCount - number of entries in inFiles.
 * Records are accumulated on a list and handed to outputChunk only at a
 * point where qName changes, so records sharing a qName are never split
 * across two chunks. A chunk is flushed at a qName change once chunkSize
 * qName changes have been seen, or once more than maxLines records have
 * been read since the last flush. chunkSize and maxLines are not declared
 * in this function; presumably they are file-level settings.
 * The first record of each input file is added without being counted in
 * totalLineCount or linesLeft (behavior kept from the original).
 * Input files that contain no records are skipped. */
{
int linesLeftInChunk = chunkSize;
int i;
char *inFile;
int totalLineCount = 0;
int midFileCount = 0;
struct lineFile *lf;
char *prev;     /* qName of the current run; points into a record, not a copy. */
struct psl *psl, *pslList = NULL;
boolean noHead = (sameWord(command, "nohead"));

mkdir(outDir, 0775);

/* Read in presorted input and scatter it into output chunks. */
for (i = 0; i<inFileCount; ++i)
    {
    int linesLeft = maxLines;
    bool breakNext = FALSE;
    inFile = inFiles[i];
    printf("Processing %s", inFile);
    fflush(stdout);
    lf = pslFileOpen(inFile);
    psl = nextPsl(lf);
    if (psl != NULL)    /* An empty file has no first record to look at. */
        {
        /* prev aliases the qName of a record instead of cloning it. It is
         * always re-pointed at the newest record before outputChunk runs,
         * and that record is not yet on the list being flushed, so prev
         * stays valid even if outputChunk frees the list (assumed - confirm
         * against outputChunk). */
        prev = psl->qName;
        slAddHead(&pslList, psl);
        while ((psl = nextPsl(lf)) != NULL)
            {
            if (!sameString(prev, psl->qName))
                {
                prev = psl->qName;
                if (--linesLeftInChunk <= 0 || breakNext)
                    {
                    outputChunk(&pslList, outDir, midFileCount++, noHead);
                    linesLeftInChunk = chunkSize;
                    linesLeft = maxLines;
                    breakNext = FALSE;
                    }
                }
            /* Too many records in this chunk: flush at the next qName change. */
            if (--linesLeft < 0)
                {
                breakNext = TRUE;
                }
            /* Progress dot every 65536 records. */
            if ((++totalLineCount & 0xffff) == 0)
                {
                printf(".");
                fflush(stdout);
                }
            slAddHead(&pslList, psl);
            }
        }
    printf("\n");
    lineFileClose(&lf);
    }
/* Flush whatever is left on the list. */
outputChunk(&pslList, outDir, midFileCount++, noHead);
printf("Processed %d lines into %d output files\n", totalLineCount, midFileCount);
}
Esempio n. 4
0
void pslSort2(char *outFile, char *tempDir)
/* Do second step of sort - merge all sorted files in tempDir
 * to final.
 *   outFile - path of the single merged output file (opened for writing).
 *   tempDir - directory scanned for "tmp*.psl" chunk files; those files
 *             are removed once the merge is finished.
 * A header is written with pslWriteHead unless nohead is set; nohead is
 * not declared in this function and is presumably a file-level flag.
 * Aborts via errAbort if there are no chunk files, if a chunk path would
 * not fit in the local buffer, or if closing the output file fails. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
int pathLen;
FILE *f = mustOpen(outFile, "w");


if (!nohead)
    pslWriteHead(f);
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);

/* Open every chunk file and keep it on the merge list. */
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    /* Bounded formatting: directory and file names come from outside. */
    pathLen = snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
        errAbort("Path too long: %s/%s", tempDir, tmp->name);
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }
verbose(1, "writing %s", outFile);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    /* Progress dot every 65536 iterations. */
    if ( (++aliCount & 0xffff) == 0)
        {
        verboseDot();
        fflush(stdout);
        }
    for (mid = midList; mid != NULL; mid = mid->next)
        {
        /* Refill the look-ahead record; close the file once it is exhausted. */
        if (mid->lf != NULL && mid->psl == NULL)
            {
            if ((mid->psl = nextPsl(mid->lf)) == NULL)
                lineFileClose(&mid->lf);
            }
        if (mid->psl != NULL)
            {
            if (bestMid == NULL || pslCmpQuery(&mid->psl, &bestMid->psl) < 0)
                bestMid = mid;
            }
        }
    if (bestMid == NULL)
        break;          /* Every chunk file is exhausted. */
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
printf("\n");
/* A failed close means buffered output may not have reached the file;
 * stop before the temp files, the only other copy of the data, are removed. */
if (fclose(f) != 0)
    errAbort("Error closing %s", outFile);

/* The following really shouldn't be necessary.... */
for (mid = midList; mid != NULL; mid = mid->next)
    lineFileClose(&mid->lf);

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    pathLen = snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
        errAbort("Path too long: %s/%s", tempDir, tmp->name);
    remove(fileName);
    }
}