Example #1
0
void pslSortAcc(char *command, char *outDir, char *tempDir, char *inFiles[], int inFileCount)
/* Two step sort driver.
 *
 * Step one reads psl records from each of the inFileCount files in inFiles,
 * gathers them into chunks of at most 250000 records and hands every chunk
 * to outputChunk together with tempDir and a running temp file index.
 * Step two calls pslSort2 on outDir and tempDir.
 *
 * command - when it matches "nohead" (per sameWord) the noHead flag
 *           passed to pslSort2 is set.
 * outDir, tempDir - created with mode 0775; the mkdir results are not
 *           checked.
 *
 * Progress is reported on stdout: the name of each input file, then one
 * '.' for every 0x10000 records read overall. */
{
const int maxChunkLines = 250000;   /* Records handed to outputChunk at once. */
int chunkLines = 0;                 /* Records gathered in the current chunk. */
int lineTotal = 0;                  /* Records read over all input files. */
int tempFileTotal = 0;              /* Number of outputChunk calls made so far. */
int fileIx;
struct psl *pending = NULL;         /* Records waiting for the next outputChunk. */
boolean noHead = sameWord(command, "nohead");

mkdir(outDir, 0775);
mkdir(tempDir, 0775);

/* Step one: scatter the input into temporary files, a chunk at a time. */
for (fileIx = 0; fileIx < inFileCount; fileIx++)
    {
    char *path = inFiles[fileIx];
    struct lineFile *in;
    struct psl *rec;

    printf("Processing %s", path);
    fflush(stdout);
    in = pslFileOpen(path);
    for (rec = nextPsl(in); rec != NULL; rec = nextPsl(in))
        {
        slAddHead(&pending, rec);
        chunkLines += 1;
        if (chunkLines >= maxChunkLines)
            {
            /* Chunk is full: flush it and start counting afresh. */
            outputChunk(&pending, tempDir, tempFileTotal);
            tempFileTotal += 1;
            chunkLines = 0;
            }
        lineTotal += 1;
        if ((lineTotal & 0xffff) == 0)
            {
            printf(".");
            fflush(stdout);
            }
        }
    printf("\n");
    lineFileClose(&in);
    }

/* Whatever is left over goes out as a final chunk; the temp file count
 * is bumped for this call regardless of how many records remain. */
outputChunk(&pending, tempDir, tempFileTotal);
tempFileTotal += 1;
printf("Processed %d lines into %d temp files\n", lineTotal, tempFileTotal);

/* Step two. */
pslSort2(outDir, tempDir, noHead);
}
Example #2
0
void pslSort(char *command, char *outFile, char *tempDir, char *inDirs[], int inDirCount)
/* Do the two step sort.
 *
 * command    - must start with "dirs" or "g2g", otherwise usage() is called.
 *              "g2g" additionally turns on mirroring of records and the
 *              suppression of self alignments.  A command ending in "1"
 *              runs only the first step, one ending in "2" only the second.
 * outFile    - passed on to pslSort2 for the second step.
 * tempDir    - directory for the intermediate ("mid") files; made with makeDir.
 * inDirs     - directories searched for "*.psl" files, or for "*.psl.gz"
 *              files when a directory has no "*.psl" files.
 * inDirCount - number of entries in inDirs.
 *
 * Step one reads the input files in groups of about sqrt(file count) files,
 * sorts each group with pslCmpQuery and writes it to one mid file.  Step two
 * is pslSort2.  Aborts through errAbort when a directory holds no psl files
 * or when a composed file name does not fit the local buffer.
 * Whether mid files get a header depends on nohead, which is defined outside
 * this function. */
{
int i;
struct slName *fileList = NULL, *name;
char *inDir;
struct slName *dirDir, *dirFile;
char fileName[512];
int fileCount;
int totalFilesProcessed = 0;
int filesPerMidFile;
int midFileCount = 0;
FILE *f;
struct lineFile *lf;
boolean doReflect = FALSE;
boolean suppressSelf = FALSE;
boolean firstOnly = endsWith(command, "1");
boolean secondOnly = endsWith(command, "2");

if (startsWith("dirs", command))
    ;
else if (startsWith("g2g", command))
    {
    doReflect = TRUE;
    suppressSelf = TRUE;
    }
else
    usage();


if (!secondOnly)
    {
    makeDir(tempDir);
    /* Figure out how many files to process. */
    for (i=0; i<inDirCount; ++i)
        {
        inDir = inDirs[i];
        dirDir = listDir(inDir, "*.psl");
        if (slCount(dirDir) == 0)
            dirDir = listDir(inDir, "*.psl.gz");
        if (slCount(dirDir) == 0)
            errAbort("No psl files in %s\n", inDir);
        verbose(1, "%s with %d files\n", inDir, slCount(dirDir));
        for (dirFile = dirDir; dirFile != NULL; dirFile = dirFile->next)
            {
            /* Bounded formatting: a long directory or file name must not
             * run past the end of fileName. */
            int len = snprintf(fileName, sizeof(fileName), "%s/%s", inDir, dirFile->name);
            if (len < 0 || len >= (int)sizeof(fileName))
                errAbort("File name too long: %s/%s\n", inDir, dirFile->name);
            name = newSlName(fileName);
            slAddHead(&fileList, name);
            }
        slFreeList(&dirDir);
        }
    verbose(1, "%d files in %d dirs\n", slCount(fileList), inDirCount);
    slReverse(&fileList);
    fileCount = slCount(fileList);
    filesPerMidFile = round(sqrt(fileCount));
    /* NOTE: a cap of 20 files per mid file used to be applied here as a
     * stopgap; memory usage is not tracked. */
    verbose(1, "Got %d files %d files per mid file\n", fileCount, filesPerMidFile);

    /* Read in files a group at a time, sort, and write merged, sorted
     * output of one group. */
    name = fileList;
    while (totalFilesProcessed < fileCount)
        {
        int filesInMidFile = 0;
        struct psl *pslList = NULL, *psl;
        int lfileCount = 0;
        /* All psl records of this group are allocated from lm and are
         * released together by lmCleanup below. */
        struct lm *lm = lmInit(256*1024);

        for (filesInMidFile = 0; filesInMidFile < filesPerMidFile && name != NULL;
            ++filesInMidFile, ++totalFilesProcessed, name = name->next)
            {
            boolean reflectMe = FALSE;
            if (doReflect)
                {
                /* Files for which selfFile() is true are not mirrored. */
                reflectMe = !selfFile(name->name);
                }
            verbose(2, "Reading %s (%d of %d)\n", name->name, totalFilesProcessed+1, fileCount);
            lf = pslFileOpen(name->name);
            while ((psl = nextLmPsl(lf, lm)) != NULL)
                {
                /* With suppressSelf set, skip a '+' strand record whose query
                 * and target have the same name and the same start. */
                if (psl->qStart == psl->tStart && psl->strand[0] == '+' && 
                    suppressSelf && sameString(psl->qName, psl->tName))
                    {
                    continue;
                    }
                ++lfileCount;
                slAddHead(&pslList, psl);
                if (reflectMe)
                    {
                    psl = mirrorLmPsl(psl, lm);
                    slAddHead(&pslList, psl);
                    }
                }
            lineFileClose(&lf);
            }
        slSort(&pslList, pslCmpQuery);
        makeMidName(tempDir, midFileCount, fileName);
        verbose(1, "Writing %s\n", fileName);
        f = mustOpen(fileName, "w");
        if (!nohead)
            pslWriteHead(f);
        for (psl = pslList; psl != NULL; psl = psl->next)
            {
            pslTabOut(psl, f);
            }
        fclose(f);
        pslList = NULL;
        lmCleanup(&lm);
        verbose(2, "lfileCount %d\n", lfileCount);
        ++midFileCount;
        }
    /* The list of input names is no longer needed; release it before the
     * second step. */
    slFreeList(&fileList);
    }
if (!firstOnly)
    pslSort2(outFile, tempDir);
}