void pslSortAcc(char *command, char *outDir, char *tempDir, char *inFiles[], int inFileCount)
/* Driver for the two pass sort.  Pass one reads every alignment from
 * the input files and hands them to outputChunk() in batches, one batch
 * per temp-file index under tempDir.  Pass two is delegated to
 * pslSort2().  A command of "nohead" is forwarded to pslSort2() as the
 * noHead flag. */
{
int batchSize = 250000;		/* Alignments handed to outputChunk() at a time. */
int inBatch = 0;		/* Alignments gathered for the current batch. */
int seen = 0;			/* Alignments read over all inputs. */
int tempCount = 0;		/* Number of outputChunk() calls so far. */
int fileIx;
struct psl *pending = NULL;
boolean noHead = sameWord(command, "nohead");

/* Return values are not examined here, so already-existing
 * directories do not stop the run. */
mkdir(outDir, 0775);
mkdir(tempDir, 0775);

/* Pass one: scatter the input into batches. */
for (fileIx = 0; fileIx < inFileCount; ++fileIx)
    {
    char *path = inFiles[fileIx];
    struct lineFile *in;
    struct psl *rec;

    printf("Processing %s", path);
    fflush(stdout);
    in = pslFileOpen(path);
    for (rec = nextPsl(in); rec != NULL; rec = nextPsl(in))
	{
	slAddHead(&pending, rec);
	inBatch += 1;
	if (inBatch >= batchSize)
	    {
	    outputChunk(&pending, tempDir, tempCount);
	    tempCount += 1;
	    inBatch = 0;
	    }
	/* One progress dot per 65536 alignments. */
	seen += 1;
	if ((seen & 0xffff) == 0)
	    {
	    printf(".");
	    fflush(stdout);
	    }
	}
    printf("\n");
    lineFileClose(&in);
    }

/* Hand over whatever is left from the last, partial batch. */
outputChunk(&pending, tempDir, tempCount);
tempCount += 1;
printf("Processed %d lines into %d temp files\n", seen, tempCount);

/* Pass two: merge. */
pslSort2(outDir, tempDir, noHead);
}
void pslSort2(char *outDir, char *tempDir, boolean noHead)
/* Do second step of sort - merge all sorted files in tempDir
 * to final outdir.
 *
 * Every tmp*.psl file in tempDir is opened and the alignment that
 * sorts lowest under pslCmpTarget() is written out repeatedly until
 * all files are exhausted.  Each alignment goes to
 * outDir/<acc>.psl, where <acc> is what getTargetAcc() produces from
 * its tName.  A new output file is started whenever <acc> differs
 * from that of the previous alignment.  Unless noHead is set each output
 * file begins with pslWriteHead().  The temp files are removed when
 * the merge is finished.  Aborts if tempDir holds no tmp*.psl files
 * or if a path does not fit the file name buffer. */
{
char fileName[512];
struct slName *tmpList, *tmp;
struct midFile *midList = NULL, *mid;
int aliCount = 0;
int pathLen;
FILE *f = NULL;
char lastTargetAcc[256];
/* NOTE(review): getTargetAcc() is given no buffer size; this assumes
 * it never produces more than 255 characters - confirm. */
char targetAcc[256];

strcpy(lastTargetAcc, "");
tmpList = listDir(tempDir, "tmp*.psl");
if (tmpList == NULL)
    errAbort("No tmp*.psl files in %s\n", tempDir);

/* Open all the temp files. */
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    pathLen = snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
	errAbort("File name too long: %s/%s", tempDir, tmp->name);
    /* The merge loop relies on mid->psl starting out NULL
     * (presumably AllocVar zeroes the struct - confirm). */
    AllocVar(mid);
    mid->lf = pslFileOpen(fileName);
    slAddHead(&midList, mid);
    }

printf("writing %s", outDir);
fflush(stdout);
/* Write out the lowest sorting line from mid list until done. */
for (;;)
    {
    struct midFile *bestMid = NULL;
    if ( (++aliCount & 0xffff) == 0)
	{
	printf(".");
	fflush(stdout);
	}
    for (mid = midList; mid != NULL; mid = mid->next)
	{
	/* Refill this file's look-ahead slot, closing the file at EOF. */
	if (mid->lf != NULL && mid->psl == NULL)
	    {
	    if ((mid->psl = nextPsl(mid->lf)) == NULL)
		lineFileClose(&mid->lf);
	    }
	if (mid->psl != NULL)
	    {
	    if (bestMid == NULL || pslCmpTarget(&mid->psl, &bestMid->psl) < 0)
		bestMid = mid;
	    }
	}
    if (bestMid == NULL)
	break;
    getTargetAcc(bestMid->psl->tName, targetAcc);
    /* Also open a file when none is open yet, so that a first
     * accession equal to the initial "" cannot reach pslTabOut()
     * with a NULL file. */
    if (f == NULL || !sameString(targetAcc, lastTargetAcc))
	{
	strcpy(lastTargetAcc, targetAcc);
	carefulClose(&f);
	pathLen = snprintf(fileName, sizeof(fileName), "%s/%s.psl", outDir, targetAcc);
	if (pathLen < 0 || pathLen >= (int)sizeof(fileName))
	    errAbort("File name too long: %s/%s.psl", outDir, targetAcc);
	f = mustOpen(fileName, "w");
	if (!noHead)
	    pslWriteHead(f);
	}
    pslTabOut(bestMid->psl, f);
    pslFree(&bestMid->psl);
    }
carefulClose(&f);
printf("\n");

printf("Cleaning up temp files\n");
for (tmp = tmpList; tmp != NULL; tmp = tmp->next)
    {
    /* Same paths as were opened above, so they are known to fit. */
    snprintf(fileName, sizeof(fileName), "%s/%s", tempDir, tmp->name);
    remove(fileName);
    }
}
void pslSplit(char *command, char *outDir, char *inFiles[], int inFileCount) /* pslSplit - "pslSplit - split into multiple output files by qName.*/ { int linesLeftInChunk = chunkSize; int i; char *inFile; char fileName[512]; int fileCount; int totalLineCount = 0; int midFileCount = 0; FILE *f; struct lineFile *lf; char *line; char *prev = cloneString("first"); int lineSize; struct psl *psl, *pslList = NULL; boolean noHead = (sameWord(command, "nohead")); mkdir(outDir, 0775); /* Read in presorted input and scatter it into sorted * temporary files. */ for (i = 0; i<inFileCount; ++i) { int linesLeft = maxLines; bool breakNext = FALSE; //char name[512]; inFile = inFiles[i]; printf("Processing %s", inFile); fflush(stdout); lf = pslFileOpen(inFile); psl = nextPsl(lf) ; prev = cloneString(psl->qName); slAddHead(&pslList, psl); while ((psl = nextPsl(lf)) != NULL) { //safef(name, sizeof(name), "%s",psl->qName); //chopSuffix(name); if (!sameString(prev, psl->qName)) { prev = cloneString(psl->qName); if (--linesLeftInChunk <= 0 || breakNext) { outputChunk(&pslList, outDir, midFileCount++, noHead); linesLeftInChunk = chunkSize; linesLeft = maxLines; breakNext = FALSE; } } if (--linesLeft < 0) { breakNext = TRUE; } if ((++totalLineCount & 0xffff) == 0) { printf("."); fflush(stdout); } slAddHead(&pslList, psl); //freeMem(&prev); } printf("\n"); lineFileClose(&lf); } outputChunk(&pslList, outDir, midFileCount++, noHead); printf("Processed %d lines into %d output files\n", totalLineCount, midFileCount); //pslSort2(outDir, tempDir, noHead); }
void pslSort2(char *outFile, char *tempDir) /* Do second step of sort - merge all sorted files in tempDir * to final. */ { char fileName[512]; struct slName *tmpList, *tmp; struct midFile *midList = NULL, *mid; int aliCount = 0; FILE *f = mustOpen(outFile, "w"); if (!nohead) pslWriteHead(f); tmpList = listDir(tempDir, "tmp*.psl"); if (tmpList == NULL) errAbort("No tmp*.psl files in %s\n", tempDir); for (tmp = tmpList; tmp != NULL; tmp = tmp->next) { sprintf(fileName, "%s/%s", tempDir, tmp->name); AllocVar(mid); mid->lf = pslFileOpen(fileName); slAddHead(&midList, mid); } verbose(1, "writing %s", outFile); fflush(stdout); /* Write out the lowest sorting line from mid list until done. */ for (;;) { struct midFile *bestMid = NULL; if ( (++aliCount & 0xffff) == 0) { verboseDot(); fflush(stdout); } for (mid = midList; mid != NULL; mid = mid->next) { if (mid->lf != NULL && mid->psl == NULL) { if ((mid->psl = nextPsl(mid->lf)) == NULL) lineFileClose(&mid->lf); } if (mid->psl != NULL) { if (bestMid == NULL || pslCmpQuery(&mid->psl, &bestMid->psl) < 0) bestMid = mid; } } if (bestMid == NULL) break; pslTabOut(bestMid->psl, f); pslFree(&bestMid->psl); } printf("\n"); fclose(f); /* The followint really shouldn't be necessary.... */ for (mid = midList; mid != NULL; mid = mid->next) lineFileClose(&mid->lf); printf("Cleaning up temp files\n"); for (tmp = tmpList; tmp != NULL; tmp = tmp->next) { sprintf(fileName, "%s/%s", tempDir, tmp->name); remove(fileName); } }