Exemplo n.º 1
0
// Expands the BED12 key record of keyList into one record per block and
// appends each of those records to keyList.
//
// keyList    - its key is treated as a Bed12Interval; block records are
//              pushed onto it.
// mustDelete - set to false when the key reports no blocks (nothing is
//              appended), and to true after block records were appended.
//
// Terminates the process with exit(-1) if the block count declared by the
// record disagrees with the number of comma-separated sizes or starts.
void BlockMgr::getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete)
{
	const Bed12Interval *bed12 = static_cast<const Bed12Interval *>(keyList.getKey());
	const int numBlocks = bed12->getBlockCount();

	// Nothing to split: report that no records were created.
	if (numBlocks <= 0) {
		mustDelete = false;
		return;
	}

	// The block sizes and block starts are comma-delimited lists.
	const int numSizes = _blockSizeTokens.tokenize(bed12->getBlockSizes(), ',');
	const int numStarts = _blockStartTokens.tokenize(bed12->getBlockStarts(), ',');

	const bool countsAgree = (numBlocks == numSizes) && (numSizes == numStarts);
	if (!countsAgree) {
		fprintf(stderr, "Error: found wrong block counts while splitting entry.\n");
		exit(-1);
	}

	for (int blockIdx = 0; blockIdx < numBlocks; blockIdx++) {
		// Block starts are offsets from the start of the key record.
		int blockStart = bed12->getStartPos() + str2chrPos(_blockStartTokens.getElem(blockIdx).c_str());
		int blockEnd = blockStart + str2chrPos(_blockSizeTokens.getElem(blockIdx).c_str());

		Record *blockRecord = allocateAndAssignRecord(bed12, blockStart, blockEnd);
		keyList.push_back(blockRecord);
	}
	mustDelete = true;
}
Exemplo n.º 2
0
bool GroupBy::init()
{
	Tokenizer groupColsTokens;
	groupColsTokens.tokenize(upCast(_context)->getGroupCols(), ',');
	int numElems = groupColsTokens.getNumValidElems();
	for (int i=0; i < numElems; i++) {
		//if the item is a range, such as 3-5,
		//must split that as well.
		const QuickString &elem = groupColsTokens.getElem(i);

		if (strchr(elem.c_str(), '-')) {
			Tokenizer rangeElems;
			rangeElems.tokenize(elem, '-');
			int startNum = str2chrPos(rangeElems.getElem(0));
			int endNum = str2chrPos(rangeElems.getElem(1));
			for (int i=startNum; i <= endNum; i++) {
				_groupCols.push_back(i);
			}
		} else {
			_groupCols.push_back(str2chrPos(elem));
		}
	}
	_queryFRM = _context->getFile(0);
	_prevFields.resize(_groupCols.size());

	_prevRecord = getNextRecord();
	return true;
}
Exemplo n.º 3
0
// Populates this GFF record from the reader's current line.
// Columns are read by index: 0 chromosome, 1 source, 2 name, 3 start,
// 4 end, 5 score, 6 strand, 7 frame and, only when the line has exactly
// nine fields, 8 group. Always returns true.
bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	setFileIdx(fileReader->getFileIdx());

	fileReader->getField(0, _chrName);
	fileReader->getField(1, _source);
	fileReader->getField(2, _name);

	// The start coordinate in the file is one-based. The string version is
	// intentionally left untouched so the print methods can still output the
	// original number, while the integer is decremented to comply with the
	// 0-based format common to other records.
	fileReader->getField(3, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	_startPos -= 1;

	// The end coordinate is taken from the file as is.
	fileReader->getField(4, _endPosStr);
	_endPos = str2chrPos(_endPosStr);

	fileReader->getField(5, _score);

	//GFF allows a '.' for the strandChar, signifying it is not known.
	fileReader->getField(6, _strand);
	adjustStrandVal();

	fileReader->getField(7, _frame);

	// The group column is only read when the line has nine fields.
	_numFields = fileReader->getNumFields();
	if (_numFields == 9) {
		fileReader->getField(8, _group);
	}

	return true;
}
Exemplo n.º 4
0
// Populates this interval from the first three columns of the reader's
// current line: chromosome name, start and end. The start and end are kept
// both as the original strings and as parsed integers. Always returns true.
bool Bed3Interval::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	fileReader->getField(0, _chrName);

	fileReader->getField(1, _startPosStr);
	_startPos = str2chrPos(_startPosStr);

	fileReader->getField(2, _endPosStr);
	_endPos = str2chrPos(_endPosStr);

	return true;
}
Exemplo n.º 5
0
// Populates this VCF record from the reader's current line.
// Columns are read by index: 0 chromosome, 1 position, 2 name, 3 reference
// allele, 4 alternate allele, 5 score. The end position is derived rather
// than read. Returns whatever initOtherFieldsFromFile returns.
bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	setFileIdx(fileReader->getFileIdx());
	fileReader->getField(0, _chrName);
	_chrId = fileReader->getCurrChromdId();

	// VCF is one-based. The string version is intentionally not decremented,
	// because the print methods still output the one-based number; only the
	// integer is decremented to comply with the 0-based format common to
	// other records.
	fileReader->getField(1, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	_startPos -= 1;

	fileReader->getField(2, _name);
	fileReader->getField(3, _varRef);
	fileReader->getField(4, _varAlt);
	fileReader->getField(5, _score);

	const bool isStructuralVariant = (_varAlt[0] == '<');
	if (isStructuralVariant) {
		// The tags have to be parsed to find the length of the variant.
		// NOTE(review): getVcfSVlen returns INT_MIN when it finds neither
		// SVLEN nor END; that value is added here unchecked.
		_endPos = _startPos + fileReader->getVcfSVlen();
	} else {
		// Otherwise the end is the start plus the length of the reference allele.
		_endPos = _startPos + _varRef.size();
	}
	int2str(_endPos, _endPosStr);

	return initOtherFieldsFromFile(fileReader);
}
Exemplo n.º 6
0
// Reports whether the tokenized line looks like GFF: it must have 8 or 9
// fields, the 4th and 5th fields must be numeric, and the 5th (end) must
// not be less than the 4th (start).
bool FileRecordTypeChecker::isGFFformat()
{
	//a GFF file may have 8 or 9 fields. Seven-field lines must be rejected:
	//GffRecord::initFromFile reads field index 7 unconditionally.
	if (_numFields < 8 || _numFields > 9) {
		return false;
	}
	//the 4th and 5th fields must be numeric.
	if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) {
		return false;
	}
	int start = str2chrPos(_tokenizer.getElem(3));
	int end = str2chrPos(_tokenizer.getElem(4));
	if (end < start) {
		return false;
	}
	return true;
}
Exemplo n.º 7
0
// Reports whether the tokenized line looks like BED: at least three fields,
// numeric 2nd and 3rd fields, and a 3rd field (end) that is not less than
// the 2nd (start).
bool FileRecordTypeChecker::isBedFormat() {

	// Fewer than three fields can never be BED.
	if (_numFields < 3) {
		return false;
	}

	// Both coordinate fields have to be numeric before they are parsed.
	if (!(isNumeric(_tokenizer.getElem(1)) && isNumeric(_tokenizer.getElem(2)))) {
		return false;
	}

	const int startPos = str2chrPos(_tokenizer.getElem(1));
	const int endPos = str2chrPos(_tokenizer.getElem(2));

	// The end must not precede the start.
	return startPos <= endPos;
}
// Derives the length of a structural variant from the INFO column
// (VCF_TAG_FIELD) of the current line.
//
// The INFO entries are scanned in order and the first usable one wins:
//   SVLEN=a[,b,...]  -> the absolute value of the entry chosen by abs_cmp
//   END=e            -> e - POS + 1, with POS read from field 1
//
// Returns INT_MIN when no usable SVLEN or END entry is found; callers have
// to check for that sentinel.
CHRPOS SingleLineDelimTextFileReader::getVcfSVlen() {
	// tokenize the INFO field
	string info_str;
	vector<string> infofields;
	getField(VCF_TAG_FIELD, info_str);
	Tokenize(info_str, infofields, ';');

	// FOO=BAR;BIZ=BAM;SVLEN=100;END=200
	for (vector<string>::iterator f = infofields.begin(); f != infofields.end(); ++f) {
		if (*f == ".") {
			continue;
		}
		vector<string> keytoval;
		Tokenize(*f, keytoval, '=');
		//key->val
		//SVLEN->100
		if (keytoval.size() != 2) {
			continue;
		}
		if (keytoval.at(0) == "SVLEN") {
			vector<CHRPOS> svlens;
			Tokenize(keytoval.at(1), svlens, ',');
			// An SVLEN entry that yields no values cannot be used; calling
			// max_element on an empty range returns the end iterator, which
			// must not be dereferenced. Keep scanning the remaining entries.
			if (svlens.empty()) {
				continue;
			}
			// With a single SVLEN this is that value; with several it is
			// the abs_max SVLEN.
			CHRPOS max_len = *max_element(svlens.begin(), svlens.end(), abs_cmp);
			return abs(max_len);
		}
		else if (keytoval.at(0) == "END") {
			string start_str;
			getField(1, start_str);
			// length is END - POS + 1. No narrowing cast: the result is
			// returned at the full width of the arithmetic.
			return str2chrPos(keytoval.at(1)) - str2chrPos(start_str) + 1;
		}
	}
	// not found
	return INT_MIN;
}
Exemplo n.º 9
0
// Handles the -d option: the next argument must exist and be numeric.
// On success stores it in _maxDistance, marks both the option and its value
// as used, advances _i past the value and returns true. Otherwise sets
// _errorMsg and returns false.
bool ContextMerge::handle_d() {
	const bool hasValue = (_i+1) < _argc;
	if (!hasValue || !isNumeric(_argv[_i+1])) {
		_errorMsg = "\n***** ERROR: -d option must be followed by an integer value *****";
		return false;
	}

	const int dist = str2chrPos(_argv[_i+1]);
	_maxDistance = dist;

	// Mark the option itself, then step onto its value and mark that too.
	markUsed(_i - _skipFirstArgs);
	_i++;
	markUsed(_i - _skipFirstArgs);
	return true;
}
Exemplo n.º 10
0
void NewGenomeFile::loadGenomeFileIntoMap() {


	ifstream genFile(_genomeFileName.c_str());
	if (!genFile.good()) {
		cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl;
		exit(1);
	}
	string sLine;
	Tokenizer fieldTokens;
	CHRPOS chrSize = 0;
	string chrName;
	while (!genFile.eof()) {
		sLine.clear();
		chrSize = 0;
		chrName.clear();
		getline(genFile, sLine);
		int numFields = fieldTokens.tokenize(sLine.c_str());
		// allow use of .fai files.
		 if (numFields < 2) {
		 	continue;
		 }
		chrName = fieldTokens.getElem(0);
		chrSize = str2chrPos(fieldTokens.getElem(1));
		_maxId++;
		_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
		_startOffsets.push_back(_genomeLength);
		_genomeLength += chrSize;
		_chromList.push_back(chrName);
	}
	if (_maxId == -1) {
		cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl;
		exit(1);
	}
	// Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string.
	// Add in an empty chrom so these don't error.
	_maxId++;
	_chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId);
	_chromList.push_back("");


	_startOffsets.push_back(_genomeLength); //insert the final length as the last element
	//to help with the lower_bound call in the projectOnGenome method.
	genFile.close();
}
Exemplo n.º 11
0
void NewGenomeFile::loadGenomeFileIntoMap() {


	ifstream genFile(_genomeFileName.c_str());
	if (!genFile.good()) {
		cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl;
		exit(1);
	}
	string sLine;
	Tokenizer fieldTokens;
	CHRPOS chrSize = 0;
	QuickString chrName;
	while (!genFile.eof()) {
		sLine.clear();
		chrSize = 0;
		chrName.clear();
		getline(genFile, sLine);
		int numFields = fieldTokens.tokenize(sLine.c_str());
		if (numFields != 2) {
			continue;
		}
		chrName = fieldTokens.getElem(0);
		chrSize = str2chrPos(fieldTokens.getElem(1));
		_maxId++;
		_chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId);
		_startOffsets.push_back(_genomeLength);
		_genomeLength += chrSize;
		_chromList.push_back(chrName);
	}
	if (_maxId == -1) {
		cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl;
		exit(1);
	}
	_startOffsets.push_back(_genomeLength); //insert the final length as the last element
	//to help with the lower_bound call in the projectOnGenome method.
	genFile.close();
}
Exemplo n.º 12
0
// Parses the argument of -iobuf into _ioBufSize.
//
// bufStr is a number optionally followed by one suffix character:
// K (x 2^10), M (x 2^20) or G (x 2^30).
//
// Returns true on success. Returns false and sets _errorMsg when the suffix
// is not recognized, when the numeric part is missing or not numeric, or
// when the resulting size is below MIN_ALLOWED_BUF_SIZE.
// NOTE(review): the product of the value and the multiplier is not checked
// for overflow.
bool ContextBase::parseIoBufSize(string bufStr)
{
	// An empty argument has no last character to inspect.
	if (bufStr.empty()) {
		_errorMsg = "\n***** ERROR: argument passed to -iobuf is not numeric. *****";
		return false;
	}

	const char lastChar = bufStr[bufStr.size()-1];
	int multiplier = 1;
	// isdigit requires a value representable as unsigned char (or EOF).
	if (!isdigit(static_cast<unsigned char>(lastChar))) {
		switch (lastChar) {
		case 'K':
			multiplier = 1 << 10;
			break;
		case 'M':
			multiplier = 1 << 20;
			break;
		case 'G':
			multiplier = 1 << 30;
			break;
		default:
			_errorMsg = "\n***** ERROR: Unrecognized memory buffer size suffix \'";
			_errorMsg += lastChar;
			_errorMsg += "\' given. *****";
			return false;
		}
		//lop off suffix character
		bufStr.resize(bufStr.size()-1);
	}
	// A bare suffix such as "K" leaves nothing to parse.
	if (bufStr.empty() || !isNumeric(bufStr)) {
		_errorMsg = "\n***** ERROR: argument passed to -iobuf is not numeric. *****";
		return false;
	}
	_ioBufSize = str2chrPos(bufStr) * multiplier;
	if (_ioBufSize < MIN_ALLOWED_BUF_SIZE) {
		_errorMsg = "\n***** ERROR: specified buffer size is too small. *****";
		return false;
	}
	return true;
}
// Reads field fieldNum of the current line as text into the member scratch
// string _tempChrPosStr, then stores its parsed integer value in val.
void SingleLineDelimTextFileReader::getField(int fieldNum, int &val) {
	getField(fieldNum, _tempChrPosStr);
	const char *fieldText = _tempChrPosStr.c_str();
	val = str2chrPos(fieldText);
}
void SingleLineDelimTextFileReader::getField(int fieldNum, CHRPOS &val) const {
	string temp;
	getField(fieldNum, temp);
	val = str2chrPos(temp.c_str());
}
Exemplo n.º 15
0
// Validates the comma-delimited column list (_columns) and operation list
// (_operations) against dbFile and fills _colOps with (column, operation
// code) pairs.
//
// Pairing rules: a single column receives every operation; a single
// operation is applied to every column; otherwise the two lists must have
// the same length and are paired position by position.
//
// Returns false, after printing an error to cerr, when either list is
// empty, the list lengths are incompatible, a column number is outside
// 1..dbFile->getNumFields(), an operation name is not recognized, or column
// 2 is requested for a BAM file. Returns true otherwise.
bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {

	Tokenizer columnList;
	Tokenizer operationList;

	const int columnCount = columnList.tokenize(_columns, ',');
	const int operationCount = operationList.tokenize(_operations, ',');

	if (operationCount < 1 || columnCount < 1) {
		cerr << endl << "*****" << endl
			<< "***** ERROR: There must be at least one column and at least one operation named." << endl;
		return false;
	}

	const bool manyColumns = columnCount > 1;
	const bool manyOperations = operationCount > 1;
	if (manyOperations && manyColumns && columnCount != operationCount) {
		cerr << endl << "*****" << endl
			<< "***** ERROR: There are " << columnCount <<" columns given, but there are " << operationCount << " operations." << endl;
		cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
		cerr << "\ta single column to which all operations will be applied," << endl;
		cerr << "\tor an operation for each column." << endl;
		return false;
	}

	// One pair is produced per entry of the longer list.
	const int pairCount = (columnCount > operationCount) ? columnCount : operationCount;

	for (int pairIdx = 0; pairIdx < pairCount; pairIdx++) {
		// A lone column is reused for every operation.
		const int columnTokenIdx = manyColumns ? pairIdx : 0;
		const int col = str2chrPos(columnList.getElem(columnTokenIdx));

		//check that the column number is valid
		if (col < 1 || col > dbFile->getNumFields()) {
			cerr << endl << "*****" << endl  << "***** ERROR: Requested column " << col << ", but database file "
				<< dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
			return false;
		}

		// A lone operation is reused for every column.
		const int operationTokenIdx = manyOperations ? pairIdx : 0;
		const QuickString &operation = operationList.getElem(operationTokenIdx);
		const OP_TYPES opCode = getOpCode(operation);
		if (opCode == INVALID) {
			cerr << endl << "*****" << endl
				<< "***** ERROR: " << operation << " is not a valid operation. " << endl;
			return false;
		}
		_colOps.push_back(pair<int, OP_TYPES>(col, opCode));
	}

	// Everything below applies to BAM input only.
	if (_dbFileType != FileRecordTypeChecker::BAM_FILE_TYPE) {
		return true;
	}

	//tell the methods class we're dealing with BAM.
	_methods.setIsBam(true);

	//column 2 of a BAM file is the flags field, which is currently not
	//supported, so requesting it is an error for now.
	for (size_t opIdx = 0; opIdx < _colOps.size(); opIdx++) {
		if (_colOps[opIdx].first == 2) {
			cerr << endl << "*****" << endl << "***** ERROR: Requested column 2 of a BAM file, which is the Flags field." << endl;
			cerr << "             We currently do not support this, but may in future versions." << endl;
			return false;
		}
	}

	return true;
}
Exemplo n.º 16
0
int str2chrPos(const string &str) {
	return str2chrPos(str.c_str(), str.size());
}