// Splits the BED12 key record of keyList into one record per block and
// appends each of them to keyList.
//
// mustDelete is set to false when the key reports no blocks (nothing is
// appended), and to true once the per-block records have been appended.
// The process exits if the block count, the number of block sizes and the
// number of block starts do not all agree.
void BlockMgr::getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete) {
	const Bed12Interval *bed12 = static_cast<const Bed12Interval *>(keyList.getKey());

	const int numBlocks = bed12->getBlockCount();
	if (numBlocks <= 0) {
		mustDelete = false;
		return;
	}

	// Both comma-separated lists must have exactly one entry per block.
	const int numSizes = _blockSizeTokens.tokenize(bed12->getBlockSizes(), ',');
	const int numStarts = _blockStartTokens.tokenize(bed12->getBlockStarts(), ',');
	if (!(numBlocks == numSizes && numSizes == numStarts)) {
		fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
		exit(-1);
	}

	for (int blockIdx = 0; blockIdx < numBlocks; ++blockIdx) {
		// Block starts are offsets from the key record's own start position.
		int blockStart = bed12->getStartPos() + str2chrPos(_blockStartTokens.getElem(blockIdx).c_str());
		int blockEnd = blockStart + str2chrPos(_blockSizeTokens.getElem(blockIdx).c_str());
		Record *blockRecord = allocateAndAssignRecord(bed12, blockStart, blockEnd);
		keyList.push_back(blockRecord);
	}
	mustDelete = true;
}
bool GroupBy::init() { Tokenizer groupColsTokens; groupColsTokens.tokenize(upCast(_context)->getGroupCols(), ','); int numElems = groupColsTokens.getNumValidElems(); for (int i=0; i < numElems; i++) { //if the item is a range, such as 3-5, //must split that as well. const QuickString &elem = groupColsTokens.getElem(i); if (strchr(elem.c_str(), '-')) { Tokenizer rangeElems; rangeElems.tokenize(elem, '-'); int startNum = str2chrPos(rangeElems.getElem(0)); int endNum = str2chrPos(rangeElems.getElem(1)); for (int i=startNum; i <= endNum; i++) { _groupCols.push_back(i); } } else { _groupCols.push_back(str2chrPos(elem)); } } _queryFRM = _context->getFile(0); _prevFields.resize(_groupCols.size()); _prevRecord = getNextRecord(); return true; }
// Populates this GFF record from the reader's current line.
// Field layout read here: 0 = chrom, 1 = source, 2 = name, 3 = start,
// 4 = end, 5 = score, 6 = strand, 7 = frame, 8 = group (only when the line
// has exactly 9 fields). Always returns true.
bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) {
	setFileIdx(fileReader->getFileIdx());
	fileReader->getField(0, _chrName);

	// The integer start is decremented by one after parsing, while the string
	// version is deliberately left as read so the print methods can still
	// output the number exactly as it appeared in the file.
	fileReader->getField(3, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	--_startPos;

	// The end position is taken from the file as-is.
	fileReader->getField(4, _endPosStr);
	_endPos = str2chrPos(_endPosStr);

	fileReader->getField(2, _name);
	fileReader->getField(1, _source);
	fileReader->getField(5, _score);

	// GFF allows a '.' for the strand, signifying it is not known.
	fileReader->getField(6, _strand);
	adjustStrandVal();

	fileReader->getField(7, _frame);

	_numFields = fileReader->getNumFields();
	if (_numFields == 9) {
		fileReader->getField(8, _group);
	}
	return true;
}
// Populates this BED3 interval from the reader's current line:
// field 0 = chrom, field 1 = start, field 2 = end. Start and end are kept
// both as text and as parsed positions. Always returns true.
bool Bed3Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) {
	fileReader->getField(0, _chrName);

	fileReader->getField(1, _startPosStr);
	fileReader->getField(2, _endPosStr);

	_startPos = str2chrPos(_startPosStr);
	_endPos = str2chrPos(_endPosStr);

	return true;
}
// Populates this VCF record from the reader's current line.
// Fields read here: 0 = chrom, 1 = position, 2 = name, 3 = ref allele,
// 4 = alt allele, 5 = score. Remaining fields are handled by
// initOtherFieldsFromFile, whose result is returned.
bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) {
	setFileIdx(fileReader->getFileIdx());
	fileReader->getField(0, _chrName);
	_chrId = fileReader->getCurrChromdId();

	// VCF is one-based. Only the integer start is decremented to match the
	// 0-based convention of the other record types; the string version is
	// intentionally kept one-based for the print methods.
	fileReader->getField(1, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	--_startPos;

	fileReader->getField(4, _varAlt);
	fileReader->getField(3, _varRef);

	// An alt allele starting with '<' marks a structural variant, whose
	// length has to be looked up through the reader.
	const bool isStructuralVariant = (_varAlt[0] == '<');
	if (isStructuralVariant) {
		_endPos = _startPos + fileReader->getVcfSVlen();
	} else {
		// Otherwise the end is the start plus the length of the ref allele.
		_endPos = _startPos + _varRef.size();
	}
	int2str(_endPos, _endPosStr);

	fileReader->getField(2, _name);
	fileReader->getField(5, _score);

	return initOtherFieldsFromFile(fileReader);
}
bool FileRecordTypeChecker::isGFFformat() { //a GFF file may have 8 or 9 fields. if (_numFields < 7 || _numFields > 9) { return false; } //the 4th and 5th fields must be numeric. if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) { return false; } int start = str2chrPos(_tokenizer.getElem(3)); int end = str2chrPos(_tokenizer.getElem(4)); if (end < start) { return false; } return true; }
bool FileRecordTypeChecker::isBedFormat() { //test that the file has at least three fields. //2nd and 3rd fields of first valid data line must be integers. 3rd must not be less than 2nd. if (_numFields < 3) { return false; } //the 2nd and 3rd fields must be numeric. if (!isNumeric(_tokenizer.getElem(1)) || !isNumeric(_tokenizer.getElem(2))) { return false; } int start = str2chrPos(_tokenizer.getElem(1)); int end = str2chrPos(_tokenizer.getElem(2)); if (end < start) { return false; } return true; }
// Derives the length of a structural variant from the INFO field
// (VCF_TAG_FIELD) of the current line.
//
// The INFO entries (e.g. FOO=BAR;BIZ=BAM;SVLEN=100;END=200) are scanned in
// order and the first usable SVLEN or END entry decides the result:
//   - SVLEN=a[,b,...] : the absolute value of the entry that abs_cmp ranks
//                       highest (the single value when only one is given).
//   - END=e           : e - POS + 1, where POS is read from field 1.
// Returns INT_MIN when neither tag yields a value; callers must be prepared
// for that sentinel.
CHRPOS SingleLineDelimTextFileReader::getVcfSVlen() {
	// tokenize the INFO field
	string info_str;
	vector<string> infofields;
	getField(VCF_TAG_FIELD, info_str);
	Tokenize(info_str, infofields, ';');

	for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
		if (*f == ".") {
			continue;
		}

		// key=val, e.g. SVLEN=100. Entries without exactly one '=' are skipped.
		vector<string> keytoval;
		Tokenize(*f, keytoval, '=');
		if (keytoval.size() != 2) {
			continue;
		}

		if (keytoval.at(0) == "SVLEN") {
			vector<CHRPOS> svlens;
			Tokenize(keytoval.at(1), svlens, ',');
			if (svlens.empty()) {
				// Nothing parsed out of the SVLEN value. max_element on an
				// empty range returns end(), which must not be dereferenced,
				// so keep scanning (a later END tag may still apply).
				continue;
			}
			// With a single SVLEN this is just that value; with several it is
			// the abs_max one.
			CHRPOS max_len = *max_element(svlens.begin(), svlens.end(), abs_cmp);
			return abs(max_len);
		}
		else if (keytoval.at(0) == "END") {
			string start_str;
			getField(1, start_str);
			// length is END - POS + 1
			return static_cast<CHRPOS>(str2chrPos(keytoval.at(1)) - str2chrPos(start_str) + 1);
		}
	}
	// not found
	return INT_MIN;
}
bool ContextMerge::handle_d() { if ((_i+1) < _argc) { if (isNumeric(_argv[_i+1])) { int dist = str2chrPos(_argv[_i+1]); _maxDistance = dist; markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } } _errorMsg = "\n***** ERROR: -d option must be followed by an integer value *****"; return false; }
void NewGenomeFile::loadGenomeFileIntoMap() { ifstream genFile(_genomeFileName.c_str()); if (!genFile.good()) { cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } string sLine; Tokenizer fieldTokens; CHRPOS chrSize = 0; string chrName; while (!genFile.eof()) { sLine.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); int numFields = fieldTokens.tokenize(sLine.c_str()); // allow use of .fai files. if (numFields < 2) { continue; } chrName = fieldTokens.getElem(0); chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); } if (_maxId == -1) { cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; exit(1); } // Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string. // Add in an empty chrom so these don't error. _maxId++; _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId); _chromList.push_back(""); _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. genFile.close(); }
void NewGenomeFile::loadGenomeFileIntoMap() { ifstream genFile(_genomeFileName.c_str()); if (!genFile.good()) { cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } string sLine; Tokenizer fieldTokens; CHRPOS chrSize = 0; QuickString chrName; while (!genFile.eof()) { sLine.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); int numFields = fieldTokens.tokenize(sLine.c_str()); if (numFields != 2) { continue; } chrName = fieldTokens.getElem(0); chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); } if (_maxId == -1) { cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; exit(1); } _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. genFile.close(); }
// Parses the argument of -iobuf into _ioBufSize.
//
// bufStr is a number optionally followed by a single suffix character:
// K (x 2^10), M (x 2^20) or G (x 2^30). Returns true on success. On failure
// sets _errorMsg and returns false: for an empty or non-numeric argument, an
// unrecognized suffix, or a resulting size below MIN_ALLOWED_BUF_SIZE.
//
// NOTE(review): the size is computed as the parsed number times an int
// multiplier; large inputs (e.g. several G) look like they could overflow --
// confirm against the types of str2chrPos and _ioBufSize.
bool ContextBase::parseIoBufSize(string bufStr) {
	// An empty argument has no last character to inspect; indexing it would
	// read out of bounds, so reject it as non-numeric right away.
	if (bufStr.empty()) {
		_errorMsg = "\n***** ERROR: argument passed to -iobuf is not numeric. *****";
		return false;
	}

	char lastChar = bufStr[bufStr.size()-1];
	int multiplier = 1;

	// isdigit requires a value representable as unsigned char, hence the cast.
	if (!isdigit(static_cast<unsigned char>(lastChar))) {
		switch (lastChar) {
		case 'K':
			multiplier = 1 << 10;
			break;
		case 'M':
			multiplier = 1 << 20;
			break;
		case 'G':
			multiplier = 1 << 30;
			break;
		default:
			_errorMsg = "\n***** ERROR: Unrecognized memory buffer size suffix \'";
			_errorMsg += lastChar;
			_errorMsg += "\' given. *****";
			return false;
		}
		//lop off suffix character
		bufStr.resize(bufStr.size()-1);
	}

	if (!isNumeric(bufStr)) {
		_errorMsg = "\n***** ERROR: argument passed to -iobuf is not numeric. *****";
		return false;
	}

	_ioBufSize = str2chrPos(bufStr) * multiplier;
	if (_ioBufSize < MIN_ALLOWED_BUF_SIZE) {
		_errorMsg = "\n***** ERROR: specified buffer size is too small. *****";
		return false;
	}
	return true;
}
// Reads field fieldNum as text into the member scratch string
// _tempChrPosStr and stores its parsed position value in val.
void SingleLineDelimTextFileReader::getField(int fieldNum, int &val) {
	getField(fieldNum, _tempChrPosStr);

	const char *fieldText = _tempChrPosStr.c_str();
	val = str2chrPos(fieldText);
}
void SingleLineDelimTextFileReader::getField(int fieldNum, CHRPOS &val) const { string temp; getField(fieldNum, temp); val = str2chrPos(temp.c_str()); }
// Validates the requested columns and operations against dbFile and fills
// _colOps with (column number, operation code) pairs.
//
// _columns and _operations are comma-delimited lists. They are paired up as
// follows: a single column receives every operation, a single operation is
// applied to every column, and otherwise the two lists must have the same
// length and are matched position by position.
//
// Returns false, after printing an explanation to cerr, when either list is
// empty, the list lengths are incompatible, a column number is outside
// 1..dbFile->getNumFields(), an operation is not recognized, or column 2 is
// requested from a BAM file. Returns true otherwise.
bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
	Tokenizer colTokens;
	Tokenizer opsTokens;

	int numCols = colTokens.tokenize(_columns, ',');
	int numOps = opsTokens.tokenize(_operations, ',');

	if (numOps < 1 || numCols < 1) {
		cerr << endl << "*****" << endl
		     << "***** ERROR: There must be at least one column and at least one operation named." << endl;
		return false;
	}

	// Several columns and several operations only make sense one-to-one.
	const bool manyOfBoth = (numOps > 1 && numCols > 1);
	if (manyOfBoth && numCols != numOps) {
		cerr << endl << "*****" << endl
		     << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
		cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
		cerr << "\ta single column to which all operations will be applied," << endl;
		cerr << "\tor an operation for each column." << endl;
		return false;
	}

	const int numPairs = max(numCols, numOps);
	for (int pairIdx = 0; pairIdx < numPairs; pairIdx++) {
		// With a single column (or a single op), index 0 is reused for every pair.
		const int colTokenIdx = (numCols > 1) ? pairIdx : 0;
		int col = str2chrPos(colTokens.getElem(colTokenIdx));

		//check that the column number is valid
		if (col < 1 || col > dbFile->getNumFields()) {
			cerr << endl << "*****" << endl
			     << "***** ERROR: Requested column " << col << ", but database file "
			     << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
			return false;
		}

		const int opTokenIdx = (numOps > 1) ? pairIdx : 0;
		const QuickString &operation = opsTokens.getElem(opTokenIdx);
		OP_TYPES opCode = getOpCode(operation);
		if (opCode == INVALID) {
			cerr << endl << "*****" << endl
			     << "***** ERROR: " << operation << " is not a valid operation. " << endl;
			return false;
		}

		_colOps.push_back(pair<int, OP_TYPES>(col, opCode));
	}

	// Everything below only concerns BAM input.
	if (_dbFileType != FileRecordTypeChecker::BAM_FILE_TYPE) {
		return true;
	}

	//tell the methods class we're dealing with BAM.
	_methods.setIsBam(true);

	// Column 2 of a BAM file is the flags field, which is currently not
	// supported, so asking for it is an error for now.
	for (size_t opIdx = 0; opIdx < _colOps.size(); opIdx++) {
		if (_colOps[opIdx].first != 2) {
			continue;
		}
		cerr << endl << "*****" << endl
		     << "***** ERROR: Requested column 2 of a BAM file, which is the Flags field." << endl;
		cerr << " We currently do not support this, but may in future versions." << endl;
		return false;
	}
	return true;
}
int str2chrPos(const string &str) { return str2chrPos(str.c_str(), str.size()); }