void pslSortAcc(char *command, char *outDir, char *tempDir, char *inFiles[], int inFileCount)
/* Two stage sort.  Stage one reads every psl record from the input
 * files and hands them to outputChunk() in batches, each batch going
 * to its own numbered temp file in tempDir.  Stage two is delegated
 * to pslSort2(), which is given outDir, tempDir and the noHead flag.
 *   command     - "nohead" (compared with sameWord) turns on noHead
 *   outDir      - output directory, created here if possible
 *   tempDir     - directory for intermediate files, created here if possible
 *   inFiles     - array of input psl file names
 *   inFileCount - number of entries in inFiles */
{
int linesPerChunk = 250000;	/* Records batched before each outputChunk call. */
int linesInChunk = 0;		/* Records gathered in the current batch. */
int lineCount = 0;		/* Records read over all inputs. */
int tempFileCount = 0;		/* Number of outputChunk calls made so far. */
int fileIx = 0;
struct psl *chunkList = NULL;
boolean noHead = (sameWord(command, "nohead"));

/* Return values are not examined, so a directory that already
 * exists is not treated as an error. */
mkdir(outDir, 0775);
mkdir(tempDir, 0775);

/* Stage one: stream each input file, batching records. */
while (fileIx < inFileCount)
    {
    char *path = inFiles[fileIx++];
    struct lineFile *in;
    struct psl *rec;

    printf("Processing %s", path);
    fflush(stdout);
    in = pslFileOpen(path);
    for (rec = nextPsl(in); rec != NULL; rec = nextPsl(in))
	{
	slAddHead(&chunkList, rec);
	++linesInChunk;
	if (linesInChunk >= linesPerChunk)
	    {
	    outputChunk(&chunkList, tempDir, tempFileCount);
	    ++tempFileCount;
	    linesInChunk = 0;
	    }

	/* Progress dot every 65536 records. */
	++lineCount;
	if ((lineCount & 0xffff) == 0)
	    {
	    printf(".");
	    fflush(stdout);
	    }
	}
    printf("\n");
    lineFileClose(&in);
    }

/* Hand over whatever is left from the last partial batch.
 * NOTE(review): the counter is bumped even when chunkList is empty,
 * so the count reported below may be one higher than the number of
 * files outputChunk actually wrote - confirm against outputChunk. */
outputChunk(&chunkList, tempDir, tempFileCount);
++tempFileCount;
printf("Processed %d lines into %d temp files\n", lineCount, tempFileCount);

/* Stage two. */
pslSort2(outDir, tempDir, noHead);
}
void pslSort(char *command, char *outFile, char *tempDir, char *inDirs[], int inDirCount) /* Do the two step sort. */ { int i; struct slName *fileList = NULL, *name; char *inDir; struct slName *dirDir, *dirFile; char fileName[512]; int fileCount; int totalFilesProcessed = 0; int filesPerMidFile; int midFileCount = 0; FILE *f; struct lineFile *lf; boolean doReflect = FALSE; boolean suppressSelf = FALSE; boolean firstOnly = endsWith(command, "1"); boolean secondOnly = endsWith(command, "2"); if (startsWith("dirs", command)) ; else if (startsWith("g2g", command)) { doReflect = TRUE; suppressSelf = TRUE; } else usage(); if (!secondOnly) { makeDir(tempDir); /* Figure out how many files to process. */ for (i=0; i<inDirCount; ++i) { inDir = inDirs[i]; dirDir = listDir(inDir, "*.psl"); if (slCount(dirDir) == 0) dirDir = listDir(inDir, "*.psl.gz"); if (slCount(dirDir) == 0) errAbort("No psl files in %s\n", inDir); verbose(1, "%s with %d files\n", inDir, slCount(dirDir)); for (dirFile = dirDir; dirFile != NULL; dirFile = dirFile->next) { sprintf(fileName, "%s/%s", inDir, dirFile->name); name = newSlName(fileName); slAddHead(&fileList, name); } slFreeList(&dirDir); } verbose(1, "%d files in %d dirs\n", slCount(fileList), inDirCount); slReverse(&fileList); fileCount = slCount(fileList); filesPerMidFile = round(sqrt(fileCount)); // if (filesPerMidFile > 20) // filesPerMidFile = 20; /* bandaide! Should keep track of mem usage. */ verbose(1, "Got %d files %d files per mid file\n", fileCount, filesPerMidFile); /* Read in files a group at a time, sort, and write merged, sorted * output of one group. 
*/ name = fileList; while (totalFilesProcessed < fileCount) { int filesInMidFile = 0; struct psl *pslList = NULL, *psl; int lfileCount = 0; struct lm *lm = lmInit(256*1024); for (filesInMidFile = 0; filesInMidFile < filesPerMidFile && name != NULL; ++filesInMidFile, ++totalFilesProcessed, name = name->next) { boolean reflectMe = FALSE; if (doReflect) { reflectMe = !selfFile(name->name); } verbose(2, "Reading %s (%d of %d)\n", name->name, totalFilesProcessed+1, fileCount); lf = pslFileOpen(name->name); while ((psl = nextLmPsl(lf, lm)) != NULL) { if (psl->qStart == psl->tStart && psl->strand[0] == '+' && suppressSelf && sameString(psl->qName, psl->tName)) { continue; } ++lfileCount; slAddHead(&pslList, psl); if (reflectMe) { psl = mirrorLmPsl(psl, lm); slAddHead(&pslList, psl); } } lineFileClose(&lf); } slSort(&pslList, pslCmpQuery); makeMidName(tempDir, midFileCount, fileName); verbose(1, "Writing %s\n", fileName); f = mustOpen(fileName, "w"); if (!nohead) pslWriteHead(f); for (psl = pslList; psl != NULL; psl = psl->next) { pslTabOut(psl, f); } fclose(f); pslList = NULL; lmCleanup(&lm); verbose(2, "lfileCount %d\n", lfileCount); ++midFileCount; } } if (!firstOnly) pslSort2(outFile, tempDir); }