//
// Builds a Genome from the contents of a FASTA file.
//
//  fileName                        Path of the FASTA file to read.
//  pieceNameTerminatorCharacters   Optional (may be NULL) NUL-terminated set of characters; a
//                                  contig name is cut at the first occurrence of any of them.
//  spaceIsAPieceNameTerminator     When true, contig names are also cut at the first space and
//                                  at the first tab.
//  chromosomePaddingSize           Number of 'n' padding bases added before each contig and once
//                                  more after the last contig.
//
// Returns the newly allocated Genome, or NULL when the file cannot be opened.  If sequence data
// appears before the first '>' header line, an error is written and soft_exit(1) is called.
//
const Genome *
ReadFASTAGenome(
    const char *fileName,
    const char *pieceNameTerminatorCharacters,
    bool spaceIsAPieceNameTerminator,
    unsigned chromosomePaddingSize)
{
    //
    // We need to know a bound on the size of the genome before we create the Genome object.
    // A bound is the number of bytes in the FASTA file, because we store at most one base per
    // byte.  Get the file size to use for this bound.
    //
    _int64 fileSize = QueryFileSize(fileName);

    //
    // Lookup table of the characters that are accepted as bases (either case of A, C, G, T, N).
    //
    bool isValidGenomeCharacter[256];
    for (int i = 0; i < 256; i++) {
        isValidGenomeCharacter[i] = false;
    }

    isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
    isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;

    FILE *fastaFile = fopen(fileName, "r");
    if (fastaFile == NULL) {
        WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName);
        return NULL;
    }

    //
    // Start from a NULL buffer so the delete [] at the end is well defined even if
    // reallocatingFgets never hands a buffer back.
    //
    int lineBufferSize = 0;
    char *lineBuffer = NULL;

    //
    // Count the chromosomes
    //
    unsigned nChromosomes = 0;

    while (NULL != reallocatingFgets(&lineBuffer,&lineBufferSize,fastaFile)) {
        if (lineBuffer[0] == '>') {
            nChromosomes++;
        }
    }
    rewind(fastaFile);

    Genome *genome = new Genome(fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize,
                                fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize,
                                chromosomePaddingSize,
                                nChromosomes + 1);

    //
    // A NUL-terminated run of chromosomePaddingSize 'n' characters, added between contigs.
    //
    char *paddingBuffer = new char[chromosomePaddingSize+1];
    for (unsigned i = 0; i < chromosomePaddingSize; i++) {
        paddingBuffer[i] = 'n';
    }
    paddingBuffer[chromosomePaddingSize] = '\0';

    bool warningIssued = false;
    bool inAContig = false;

    while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, fastaFile)) {
        if (lineBuffer[0] == '>') {
            inAContig = true;

            //
            // A new contig.  Add in the padding first.
            //
            genome->addData(paddingBuffer);

            //
            // Now supply the chromosome name.  Each truncation below only ever shortens the
            // line, so the name ends at the earliest terminator of any kind.
            //
            if (NULL != pieceNameTerminatorCharacters) {
                for (const char *candidate = pieceNameTerminatorCharacters; *candidate != '\0'; candidate++) {
                    char *terminator = strchr(lineBuffer+1, *candidate);
                    if (NULL != terminator) {
                        *terminator = '\0';
                    }
                }
            }

            if (spaceIsAPieceNameTerminator) {
                char *terminator = strchr(lineBuffer, ' ');
                if (NULL != terminator) {
                    *terminator = '\0';
                }
                terminator = strchr(lineBuffer, '\t');
                if (NULL != terminator) {
                    *terminator = '\0';
                }
            }

            char *terminator = strchr(lineBuffer, '\n');
            if (NULL != terminator) {
                *terminator = '\0';
            }
            terminator = strchr(lineBuffer, '\r');
            if (NULL != terminator) {
                *terminator = '\0';
            }

            // Skip the leading '>' when handing the name to the genome.
            genome->startContig(lineBuffer+1);
        } else {
            if (!inAContig) {
                WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
                soft_exit(1);
            }

            //
            // Truncate the line terminator before adding the line to the genome.  Strip '\r' as
            // well as '\n' (just as is done for header lines) so that a CRLF-terminated file
            // doesn't get a bogus extra base appended to every line.
            //
            char *lineEnd = strchr(lineBuffer, '\n');
            if (NULL != lineEnd) {
                *lineEnd = 0;
            }
            lineEnd = strchr(lineBuffer, '\r');
            if (NULL != lineEnd) {
                *lineEnd = 0;
            }

            //
            // Convert the whole line to upper case first, so that the warning below prints the
            // converted line.  Go through unsigned char: passing a negative char to toupper is
            // undefined.
            //
            size_t lineLen = strlen(lineBuffer);

            for (size_t i = 0; i < lineLen; i++) {
                lineBuffer[i] = (char)toupper((unsigned char)lineBuffer[i]);
            }

            //
            // But convert any 'N' to 'n'.  This is so we don't match the N from the genome with N
            // in reads (where we just do a straight text comparison).
            //
            for (size_t i = 0; i < lineLen; i++) {
                if ('N' == lineBuffer[i]) {
                    lineBuffer[i] = 'n';
                }

                if (!isValidGenomeCharacter[(unsigned char)lineBuffer[i]]) {
                    if (!warningIssued) {
                        WriteErrorMessage("\nFASTA file contained a character that's not a valid base (or N): '%c', full line '%s'; \nconverting to 'N'.  This may happen again, but there will be no more warnings.\n", lineBuffer[i], lineBuffer);
                        warningIssued = true;
                    }
                    // NOTE(review): invalid characters become upper-case 'N' while genuine N's are
                    // stored as 'n'; this matches the warning text, but confirm it is intended.
                    lineBuffer[i] = 'N';
                }
            }

            genome->addData(lineBuffer);
        }
    }

    //
    // And finally add padding at the end of the genome.
    //
    genome->addData(paddingBuffer);
    genome->fillInContigLengths();
    genome->sortContigsByName();

    fclose(fastaFile);
    delete [] paddingBuffer;
    delete [] lineBuffer;
    return genome;
}
//
// Loads a genome (or a window of one) from a genome file.
//
//  fileName            Path of the genome file; opened through openFileAndGetSizes().
//  chromosomePadding   Passed through to the Genome constructor.
//  i_minOffset         Offset of the first base to load; must be smaller than the number of
//                      bases recorded in the file.
//  length              Number of bases to load.  0 means "everything from i_minOffset to the
//                      end"; larger values are clamped to what the file holds.
//
// After the sizes, the file is read as one line per contig of the form
// "<beginningOffset> <name>\n", followed by the bases themselves.
//
// Returns the new Genome, or NULL on failure (an error message has been written).
//
const Genome *
Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, unsigned i_minOffset, unsigned length)
{
    GenericFile *loadFile;
    unsigned nBases,nContigs;

    if (!openFileAndGetSizes(fileName,&loadFile,&nBases,&nContigs)) {
        //
        // It already printed an error.  Just fail.
        //
        return NULL;
    }

    //
    // Validate the offset before it is used: nBases - i_minOffset below would otherwise wrap
    // around and produce a bogus length.
    //
    if (i_minOffset >= nBases) {
        WriteErrorMessage("Genome::loadFromFile: specified minOffset %u >= nBases %u\n",i_minOffset,nBases);
        loadFile->close();
        delete loadFile;
        return NULL;
    }

    if (0 == length) {
        length = nBases - i_minOffset;
    } else {
        //
        // Don't let length go beyond nBases.
        //
        length = __min(length,nBases - i_minOffset);
    }

    Genome *genome = new Genome(nBases,length, chromosomePadding);

    genome->nBases = nBases;
    genome->nContigs = genome->maxContigs = nContigs;
    genome->contigs = new Contig[nContigs];
    genome->minOffset = i_minOffset;
    genome->maxOffset = i_minOffset + length;

    static const unsigned contigNameBufferSize = 512;
    char contigNameBuffer[contigNameBufferSize];

    for (unsigned i = 0; i < nContigs; i++) {
        if (NULL == loadFile->gets(contigNameBuffer, contigNameBufferSize)){
            WriteErrorMessage("Unable to read contig description\n");
            loadFile->close();
            delete loadFile;
            delete genome;
            return NULL;
        }

        //
        // The offset and the name are separated by a single space.  Only look inside the string
        // that was actually read; a line with no separator is malformed.
        //
        char *separator = strchr(contigNameBuffer, ' ');
        if (NULL == separator) {
            WriteErrorMessage("Genome::loadFromFile: malformed contig description '%s'\n", contigNameBuffer);
            loadFile->close();
            delete loadFile;
            delete genome;
            return NULL;
        }

        *separator = '\0';
        // strtoul rather than atoi: offsets may exceed INT_MAX.
        genome->contigs[i].beginningOffset = (unsigned)strtoul(contigNameBuffer, NULL, 10);

        const char *nameStart = separator + 1;
        size_t contigSize = strlen(nameStart);
        if (contigSize > 0 && '\n' == nameStart[contigSize - 1]) {
            contigSize--;   // don't include the final \n
        }

        char *curName = new char[contigSize + 1];
        memcpy(curName, nameStart, contigSize);
        curName[contigSize] = '\0';

        genome->contigs[i].name = curName;
        genome->contigs[i].nameLength = (unsigned)contigSize;
    }

    //
    // Move to the first base we were asked for.
    //
    if (0 != loadFile->advance(i_minOffset)) {
        WriteErrorMessage("Genome::loadFromFile: _fseek64bit failed\n");
        soft_exit(1);
    }

    size_t retval = loadFile->read(genome->bases,length);
    if (length != retval) {
        WriteErrorMessage("Genome::loadFromFile: fread of bases failed; wanted %u, got %llu\n", length, (unsigned long long)retval);
        loadFile->close();
        delete loadFile;
        delete genome;
        return NULL;
    }

    loadFile->close();
    delete loadFile;

    genome->fillInContigLengths();
    genome->sortContigsByName();
    return genome;
}
// // Makes a copy of a Genome, but with only one of the sex chromosomes. // // The fate of the mitochondrion is that of the X chromosome. // Genome * Genome::copy(bool copyX, bool copyY, bool copyM) const { Genome *newCopy = new Genome(getCountOfBases(),getCountOfBases(), chromosomePadding); if (NULL == newCopy) { WriteErrorMessage("Genome::copy: failed to allocate space for copy.\n"); return NULL; } const Genome::Contig *currentContig = NULL; const Genome::Contig *nextContig = getContigAtLocation(0); unsigned offsetInReference = 0; while (offsetInReference < getCountOfBases()) { if (NULL != nextContig && offsetInReference >= nextContig->beginningOffset) { // // Start of a new contig. See if we want to skip it. // currentContig = nextContig; nextContig = getNextContigAfterLocation(offsetInReference + 1); if ((!copyX && !strcmp(currentContig->name,"chrX")) || (!copyY && !strcmp(currentContig->name,"chrY")) || (!copyM && !strcmp(currentContig->name,"chrM"))) { // // Yes, skip over this contig. // nextContig = getNextContigAfterLocation(offsetInReference + 1); if (NULL == nextContig) { // // The chromosome that we're skipping was the last one, so we're done. // break; } else { offsetInReference = nextContig->beginningOffset; continue; } } // If skipping this chromosome newCopy->startContig(currentContig->name); } // If new contig beginning const size_t maxCopySize = 10000; char dataBuffer[maxCopySize + 1]; unsigned amountToCopy = maxCopySize; if (nextContig && nextContig->beginningOffset < offsetInReference + amountToCopy) { amountToCopy = nextContig->beginningOffset - offsetInReference; } if (getCountOfBases() < offsetInReference + amountToCopy) { amountToCopy = getCountOfBases() - offsetInReference; } memcpy(dataBuffer,getSubstring(offsetInReference,amountToCopy), amountToCopy); dataBuffer[amountToCopy] = '\0'; newCopy->addData(dataBuffer); offsetInReference += amountToCopy; } newCopy->fillInContigLengths(); newCopy->sortContigsByName(); return newCopy; }
//
// Loads a genome (or a window of one) from a genome file, either by reading it or by mapping it.
//
//  fileName            Path of the genome file; opened through openFileAndGetSizes().
//  chromosomePadding   Passed through to the Genome constructor.
//  minLocation         Location of the first base to load; must be smaller than the number of
//                      bases recorded in the file, otherwise an error is written and
//                      soft_exit(-1) is called.
//  length              Number of bases to load.  0 means "everything from minLocation to the
//                      end"; larger values are clamped to what the file holds.
//  map                 When true the file is treated as a GenericFile_map: the bases come from
//                      mapAndAdvance() and the genome keeps the mapped file.  When false the
//                      bases are read into the genome's buffer and the file is closed.
//
// After the sizes, the file is read as one line per contig of the form
// "<beginningLocation> <name>\n", followed by the bases themselves.
//
// Returns the new Genome, or NULL on failure (an error message has been written).
//
const Genome *
Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLocation minLocation, GenomeDistance length, bool map)
{
    GenericFile *loadFile;
    GenomeDistance nBases;
    unsigned nContigs;

    if (!openFileAndGetSizes(fileName, &loadFile, &nBases, &nContigs, map)) {
        //
        // It already printed an error.  Just fail.
        //
        return NULL;
    }

    //
    // Validate the starting location before it is used to compute a length and size the Genome.
    //
    if (GenomeLocationAsInt64(minLocation) >= nBases) {
        WriteErrorMessage("Genome::loadFromFile: specified minOffset %lld >= nBases %lld\n",
            (long long)GenomeLocationAsInt64(minLocation), (long long)nBases);
        soft_exit(-1);
    }

    GenomeLocation maxLocation(nBases);

    if (0 == length) {
        length = maxLocation - minLocation;
    } else {
        //
        // Don't let length go beyond nBases.
        //
        length = __min(length, maxLocation - minLocation);
        maxLocation = minLocation + length;
    }

    Genome *genome = new Genome(nBases, length, chromosomePadding);

    genome->nBases = nBases;
    genome->nContigs = genome->maxContigs = nContigs;
    genome->contigs = new Contig[nContigs];
    genome->minLocation = minLocation;
    genome->maxLocation = maxLocation;

    static const unsigned contigNameBufferSize = 512;
    char contigNameBuffer[contigNameBufferSize];

    for (unsigned i = 0; i < nContigs; i++) {
        if (NULL == loadFile->gets(contigNameBuffer, contigNameBufferSize)){
            WriteErrorMessage("Unable to read contig description\n");
            loadFile->close();
            delete loadFile;
            delete genome;
            return NULL;
        }

        //
        // The start location and the name are separated by a single space.  Only look inside the
        // string that was actually read; a line with no separator is malformed.
        //
        char *separator = strchr(contigNameBuffer, ' ');
        if (NULL == separator) {
            WriteErrorMessage("Unable to parse contig description in genome file '%s', '%s'\n", fileName, contigNameBuffer);
            loadFile->close();
            delete loadFile;
            delete genome;
            return NULL;
        }

        *separator = '\0';

        _int64 contigStart;
        if (1 != sscanf(contigNameBuffer, "%lld", &contigStart)) {
            WriteErrorMessage("Unable to parse contig start in genome file '%s', '%s'\n", fileName, contigNameBuffer);
            soft_exit(1);
        }

        genome->contigs[i].beginningLocation = GenomeLocation(contigStart);

        const char *nameStart = separator + 1;
        size_t contigSize = strlen(nameStart);
        if (contigSize > 0 && '\n' == nameStart[contigSize - 1]) {
            contigSize--;   // don't include the final \n
        }

        char *curName = new char[contigSize + 1];
        memcpy(curName, nameStart, contigSize);
        curName[contigSize] = '\0';

        genome->contigs[i].name = curName;
        genome->contigs[i].nameLength = (unsigned)contigSize;
    }

    //
    // Move to the first base we were asked for.
    //
    if (0 != loadFile->advance(GenomeLocationAsInt64(minLocation))) {
        WriteErrorMessage("Genome::loadFromFile: _fseek64bit failed\n");
        soft_exit(1);
    }

    size_t readSize;
    if (map) {
        GenericFile_map *mappedFile = (GenericFile_map *)loadFile;
        genome->bases = (char *)mappedFile->mapAndAdvance(length, &readSize);
        genome->mappedFile = mappedFile;
        mappedFile->prefetch();
    } else {
        readSize = loadFile->read(genome->bases, length);
        loadFile->close();
        delete loadFile;
        loadFile = NULL;
    }

    if (length != readSize) {
        WriteErrorMessage("Genome::loadFromFile: fread of bases failed; wanted %lld, got %llu\n",
            (long long)length, (unsigned long long)readSize);
        // NOTE(review): in the mapped case loadFile is the same object as genome->mappedFile; if
        // the Genome destructor also releases mappedFile this deletes it twice -- confirm.
        delete loadFile;
        delete genome;
        return NULL;
    }

    genome->fillInContigLengths();
    genome->sortContigsByName();
    return genome;
}