bool GroupBy::init() { Tokenizer groupColsTokens; groupColsTokens.tokenize(upCast(_context)->getGroupCols(), ','); int numElems = groupColsTokens.getNumValidElems(); for (int i=0; i < numElems; i++) { //if the item is a range, such as 3-5, //must split that as well. const QuickString &elem = groupColsTokens.getElem(i); if (strchr(elem.c_str(), '-')) { Tokenizer rangeElems; rangeElems.tokenize(elem, '-'); int startNum = str2chrPos(rangeElems.getElem(0)); int endNum = str2chrPos(rangeElems.getElem(1)); for (int i=startNum; i <= endNum; i++) { _groupCols.push_back(i); } } else { _groupCols.push_back(str2chrPos(elem)); } } _queryFRM = _context->getFile(0); _prevFields.resize(_groupCols.size()); _prevRecord = getNextRecord(); return true; }
void NewGenomeFile::loadGenomeFileIntoMap() { ifstream genFile(_genomeFileName.c_str()); if (!genFile.good()) { cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } string sLine; Tokenizer fieldTokens; CHRPOS chrSize = 0; string chrName; while (!genFile.eof()) { sLine.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); int numFields = fieldTokens.tokenize(sLine.c_str()); // allow use of .fai files. if (numFields < 2) { continue; } chrName = fieldTokens.getElem(0); chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); } if (_maxId == -1) { cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; exit(1); } // Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string. // Add in an empty chrom so these don't error. _maxId++; _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId); _chromList.push_back(""); _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. genFile.close(); }
void NewGenomeFile::loadGenomeFileIntoMap() { ifstream genFile(_genomeFileName.c_str()); if (!genFile.good()) { cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } string sLine; Tokenizer fieldTokens; CHRPOS chrSize = 0; QuickString chrName; while (!genFile.eof()) { sLine.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); int numFields = fieldTokens.tokenize(sLine.c_str()); if (numFields != 2) { continue; } chrName = fieldTokens.getElem(0); chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); } if (_maxId == -1) { cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; exit(1); } _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. genFile.close(); }
/**
 * Validate the requested columns and operations and build _colOps.
 *
 * _columns and _operations are comma-delimited lists. They are split into
 * tokens and paired up as follows:
 *   - a single column is paired with every operation;
 *   - a single operation is paired with every column;
 *   - otherwise the lists must be the same length and are paired in order.
 * Each pair (column number, operation code) is appended to _colOps.
 *
 * An error is printed to cerr and false is returned when either list is
 * empty, the list lengths are incompatible, a column number falls outside
 * 1..dbFile->getNumFields(), an operation name maps to INVALID, or the
 * database file type is BAM and column 2 (the flags field) was requested.
 * Pairs appended before a failure is detected remain in _colOps.
 *
 * When the database file type is BAM, _methods is also told so via
 * setIsBam(true).
 *
 * @param dbFile database file whose field count bounds the column numbers
 *               and whose name is used in error messages.
 * @return true if every column/operation pair is valid, false otherwise.
 */
bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
    // Break both comma-delimited lists into tokens.
    Tokenizer columnList;
    Tokenizer operationList;
    const int columnCount = columnList.tokenize(_columns, ',');
    const int operationCount = operationList.tokenize(_operations, ',');

    if (columnCount < 1 || operationCount < 1) {
        cerr << endl << "*****" << endl
             << "***** ERROR: There must be at least one column and at least one operation named." << endl;
        return false;
    }

    const bool severalColumns = columnCount > 1;
    const bool severalOperations = operationCount > 1;

    // Multiple columns with multiple operations only make sense one-to-one.
    if (severalColumns && severalOperations && columnCount != operationCount) {
        cerr << endl << "*****" << endl
             << "***** ERROR: There are " << columnCount << " columns given, but there are "
             << operationCount << " operations." << endl;
        cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
        cerr << "\ta single column to which all operations will be applied," << endl;
        cerr << "\tor an operation for each column." << endl;
        return false;
    }

    // One pair per entry of the longer list; the shorter list (if of
    // length one) is reused for every pair.
    const int pairCount = (columnCount > operationCount) ? columnCount : operationCount;

    for (int idx = 0; idx < pairCount; ++idx) {
        const int columnIdx = severalColumns ? idx : 0;
        const int col = str2chrPos(columnList.getElem(columnIdx));

        // The column number must name an existing field of the database file.
        const bool columnInRange = (col >= 1) && (col <= dbFile->getNumFields());
        if (!columnInRange) {
            cerr << endl << "*****" << endl
                 << "***** ERROR: Requested column " << col << ", but database file "
                 << dbFile->getFileName() << " only has fields 1 - "
                 << dbFile->getNumFields() << "." << endl;
            return false;
        }

        const int operationIdx = severalOperations ? idx : 0;
        const QuickString &operation = operationList.getElem(operationIdx);
        const OP_TYPES opCode = getOpCode(operation);
        if (opCode == INVALID) {
            cerr << endl << "*****" << endl
                 << "***** ERROR: " << operation << " is not a valid operation. " << endl;
            return false;
        }

        pair<int, OP_TYPES> colOp(col, opCode);
        _colOps.push_back(colOp);
    }

    // Everything below applies to BAM input only.
    const bool isBam = (_dbFileType == FileRecordTypeChecker::BAM_FILE_TYPE);
    if (!isBam) {
        return true;
    }

    // Tell the methods class we're dealing with BAM.
    _methods.setIsBam(true);

    // Column 2 of a BAM file is the flags field, which is currently not
    // supported, so requesting it is an error for now.
    for (size_t k = 0; k < _colOps.size(); ++k) {
        if (_colOps[k].first != 2) {
            continue;
        }
        cerr << endl << "*****" << endl
             << "***** ERROR: Requested column 2 of a BAM file, which is the Flags field." << endl;
        cerr << " We currently do not support this, but may in future versions." << endl;
        return false;
    }
    return true;
}