예제 #1
0
파일: g2i.cpp 프로젝트: dkj/simtools
//
// Create the per-chromosome intensity files.
//
// Called once the Manifest and all of the GTC files have been read. The SNPs
// are sorted into position order, then for every selected SNP a record is
// written to "<fname>_intu_<chromosome>.txt", followed by a blank placeholder
// whose file offset is remembered in filePos. The intensities of each GTC file
// are then collected in cache and handed to flushCache().
//
// fname   common prefix of the output files, of the "<fname>.lock" file that
//         exists while this function runs, and of the "<fname>.g2i" file the
//         lock file is renamed to on completion
//
// Reads the globals gtcHash, manifest, binary, normalise, excludeCnv, chrSelect
// and verbose; updates manifest->snps (sort order), outFile, filenameArray,
// filePos, cache and gtc.
// Exits the process if the lock file or an output file cannot be created.
//
void goForIt(string fname)
{
	// Every record reserves room for two values (A and B) per GTC file:
	// 10 bytes per value in text mode, sizeof(float) per value in binary mode.
	int recordLength = gtcHash.size() * 10 * 2;
	if (binary) recordLength = gtcHash.size() * sizeof(float) * 2;

	// Placeholder written after each SNP: nulls in binary mode, spaces ending
	// in a newline in text mode. The vector releases its memory on every exit
	// path, and the newline is only stored when there is room for it
	// (recordLength is 0 when gtcHash is empty).
	const char fill = binary ? '\0' : ' ';
	std::vector<char> buffer(recordLength, fill);
	if (!binary && !buffer.empty()) buffer.back() = '\n';
	const char *placeholder = buffer.empty() ? "" : &buffer[0];

	// Sort the SNPs into position order
	sort(manifest->snps.begin(), manifest->snps.end(), SortByPosition);

	// Create lockfile
	string lockFileName = fname + ".lock";
	FILE *lockfile = fopen(lockFileName.c_str(), "w");
	if (!lockfile) {
		cerr << "Can't create lock file " << lockFileName << endl;
		cerr << strerror(errno) << endl;
		exit(1);
	}
	fclose(lockfile);

	//
	// Create all of the output files - one for each chromosome
	//
	for (vector<snpClass>::iterator snp = manifest->snps.begin(); snp != manifest->snps.end(); ++snp) {
		if (excludeCnv && snp->name.find("cnv") != string::npos) continue;
		if (chrSelect.size() && chrSelect.compare(snp->chromosome)) continue;
		fstream *f = outFile[snp->chromosome];
		if (!f) {
			// first SNP seen on this chromosome: create its file and header line
			f = new fstream();
			string fullFname = fname + "_intu_" + snp->chromosome + ".txt";
			filenameArray.push_back("_intu_" + snp->chromosome + ".txt");
			if (verbose) cout << timestamp() << "creating file " << fullFname << endl;
			if (binary) f->open(fullFname.c_str(), ios::in | ios::out | ios::trunc | ios::binary);
			else        f->open(fullFname.c_str(), ios::in | ios::out | ios::trunc);
			if (!f->is_open()) {
				// carrying on would record invalid offsets in filePos
				cerr << "Can't create output file " << fullFname << endl;
				exit(1);
			}

			*f << "SNP\tCoor\tAlleles";
			// write sample names from all the gtc files
			for (hash_map<string,string>::iterator i = gtcHash.begin(); i != gtcHash.end(); ++i) {
				*f << "\t" << i->first << "A\t" << i->first << "B";
			}
			*f << endl;
			// associate the file handle with the chromosome
			outFile[snp->chromosome] = f;
		}
		*f << snp->name << "\t" << snp->position << "\t" << snp->snp[0] << snp->snp[1];
		filePos[snp->name] = f->tellp();	// store next position to write
		f->write(placeholder, recordLength);	// fill with nulls (or spaces)

		cache[snp->name] = new float[CACHESIZE];
	}

	//
	// Process each GTC file in turn
	//
	int n=1;
	int cacheIndex = 0;	// next free slot in every per-SNP cache array
	for (hash_map<string,string>::iterator i = gtcHash.begin(); i != gtcHash.end(); ++i) {
		if (verbose) cout << timestamp() << "Processing GTC file " << n++ << " of " << gtcHash.size() << endl;
		gtc.open(i->second,Gtc::XFORM | Gtc::INTENSITY);	// reload GTC file to read XForm and Intensity arrays

		for (vector<snpClass>::iterator snp = manifest->snps.begin(); snp != manifest->snps.end(); ++snp) {
			// same selection as above, so only SNPs that own a cache entry are touched
			if (excludeCnv && snp->name.find("cnv") != string::npos) continue;
			if (chrSelect.size() && chrSelect.compare(snp->chromosome)) continue;
			int idx = snp->index - 1;	// index is zero based in arrays, but starts from 1 in the map file
			unsigned int norm = manifest->normIdMap[snp->normId];
			XFormClass *XF = &gtc.XForm[norm];

			double xn, yn;
			if (normalise) {
				// first do the normalisation calculation:
				// remove the offset, rotate by theta, remove the shear, then scale
				double tempx = gtc.xRawIntensity[idx] - XF->xOffset;
				double tempy = gtc.yRawIntensity[idx] - XF->yOffset;

				double cos_theta = cos(XF->theta);
				double sin_theta = sin(XF->theta);
				double tempx2 = cos_theta * tempx + sin_theta * tempy;
				double tempy2 = -sin_theta * tempx + cos_theta * tempy;

				double tempx3 = tempx2 - XF->shear * tempy2;
				double tempy3 = tempy2;

				xn = tempx3 / XF->xScale;
				yn = tempy3 / XF->yScale;
			} else {
				xn = gtc.xRawIntensity[idx];
				yn = gtc.yRawIntensity[idx];
			}

			cache[snp->name][cacheIndex] = xn;
			cache[snp->name][cacheIndex+1] = yn;
		}
		// each GTC file takes two slots of every cache array; flush once they are full
		cacheIndex += 2;
		if (cacheIndex == CACHESIZE) { flushCache(cacheIndex); cacheIndex=0; }
	}

	// flush whatever is left from the last (partial) batch
	flushCache(cacheIndex);

	// close all of the files
	for (pos = outFile.begin(); pos != outFile.end(); pos++) {
		pos->second->close();
	}

	// delete lockfile and create donefile
	// (the name is built from fname directly so that a ".lock" occurring inside
	// fname itself cannot be mistaken for the lock file suffix)
	string doneFileName = fname + ".g2i";
	if (rename(lockFileName.c_str(), doneFileName.c_str()) != 0) {
		cerr << "Can't rename " << lockFileName << " to " << doneFileName << endl;
		cerr << strerror(errno) << endl;
		return;
	}
	if (verbose) cout << timestamp() << "Renamed " << lockFileName << " to " << doneFileName << endl;
}
예제 #2
0
파일: g2i.cpp 프로젝트: dkj/simtools
//
// Create a SIM file holding the raw intensities of every GTC file in gtcHash.
//
// fname   name of the SIM file, passed to Sim::createFile()
//
// The header is sized from the first GTC file. For each GTC file a fixed-width,
// zero-padded sample name field is written, followed by the x and y raw
// intensity of every probe as 16 bit values, in GTC order.
// Reads the globals gtcHash, sampleNames and verbose; reloads the global gtc.
// Does nothing except report an error if gtcHash is empty.
//
void createSimFile(string fname)
{
	// The first entry is needed to size the header, so there must be one
	if (gtcHash.size() == 0) {
		cerr << "No GTC files to write to " << fname << endl;
		return;
	}

	// NOTE(review): sim is never deleted; left as is because Sim's destructor
	// is not visible from here - confirm before adding a delete.
	Sim *sim = new Sim();

	hash_map<string,string>::iterator first = gtcHash.begin();
	gtc.open(first->second, Gtc::INTENSITY);
	sim->createFile(fname);
	sim->writeHeader(gtcHash.size(), gtc.xRawIntensity.size());

	// One name field reused for every sample (previously allocated per
	// iteration and never freed)
	std::vector<char> nameField(sim->sampleNameSize);

	//
	// Process each GTC file in turn
	//
	unsigned int n=1;
	for (hash_map<string,string>::iterator i = gtcHash.begin(); i != gtcHash.end(); ++i) {
		if (verbose) cout << timestamp() << "Processing GTC file " << n << " of " << gtcHash.size() << endl;

		gtc.open(i->second,Gtc::XFORM | Gtc::INTENSITY);	// reload GTC file to read XForm and Intensity arrays

		// if we have a sample name from the json file, use it
		const char *name = (sampleNames.size() > (n-1)) ? sampleNames[n-1].c_str()
		                                                : gtc.sampleName.c_str();
		if (!nameField.empty()) {
			// Bounded copy: an over-long name is truncated instead of overrunning
			// the field, and the last byte always stays a terminating NUL
			memset(&nameField[0], 0, nameField.size());
			strncpy(&nameField[0], name, nameField.size() - 1);
			sim->write(&nameField[0], sim->sampleNameSize);
		}

		// raw x/y intensity pairs, one pair per probe
		for (unsigned int idx = 0; idx < gtc.xRawIntensity.size(); idx++) {
			uint16_t v;
			v = gtc.xRawIntensity[idx];
			sim->write(&v,sizeof(v));
			v = gtc.yRawIntensity[idx];
			sim->write(&v,sizeof(v));
		}
		n++;
	}
	sim->close();
}
예제 #3
0
파일: simtools.cpp 프로젝트: dkj/simtools
//
// Create a SIM file from one or more GTC files
//
// infile      a file containing either a simple list of GTC files, or a list in JSON format
// outfile     the name of the SIM file to create, or '-' to write to stdout
// normalize   if true, normalize the intensities, else store the raw values in the SIM file
// manfile     the name of the manifest file
// verbose     boolean (default false)
//
// Note that the SIM file is written with the intensities sorted into position order, as given
// by the manifest file.
//
// Throws a C string if infile is empty or lists no GTC files, the Gtc error message if a GTC
// file cannot be opened, and a std::string if a GTC file and the manifest disagree on the
// number of probes.
//
void commandCreate(string infile, string outfile, bool normalize, string manfile, bool verbose)
{
	vector<string> sampleNames;		// list of sample names from JSON input file
	vector<string> infiles;			// list of GTC files to process
	// NOTE(review): sim, gtc and manifest are never deleted (and leak on every throw below);
	// left as is because the destructors of these classes are not visible from here.
	Sim *sim = new Sim();
	Gtc *gtc = new Gtc();
	Manifest *manifest = new Manifest();
	// 0 = intensities stored as float (normalized), 1 = stored as uint16_t (raw); see the write loop
	int numberFormat = normalize ? 0 : 1;

	//
	// First, get a list of GTC files. and possibly sample names
	//
	if (infile == "") throw("commandCreate(): infile not specified");

	parseInfile(infile,sampleNames,infiles);
	if (infiles.size() == 0) throw("No GTC files are specified in the infile");

	// Let's check the GTC files, shall we?
	for (unsigned int n = 0; n < infiles.size(); n++) {
		gtc->open(infiles[n],0);
		if (gtc->errorMsg.length()) throw gtc->errorMsg;
	}

	// We need a manifest file to sort the SNPs and to normalise the intensities (if required)
	loadManifest(manifest, manfile);
	// Sort the SNPs into position order
	sort(manifest->snps.begin(), manifest->snps.end(), SortByPosition);

	// Create the SIM file and write the header
	// (numSnps comes from the last GTC file opened by the check loop above)
	sim->createFile(outfile);
	sim->writeHeader(infiles.size(),gtc->numSnps, 2, numberFormat);

	// One zero-padded name field reused for every sample (previously allocated per
	// iteration and never freed)
	std::vector<char> nameField(sim->sampleNameSize);

	// For each GTC file, write the sample name and intensities to the SIM file
	for (unsigned int n = 0; n < infiles.size(); n++) {
		gtc->open(infiles[n], Gtc::XFORM | Gtc::INTENSITY);
		if (manifest->snps.size() != gtc->xRawIntensity.size()) {
			ostringstream msg;
			// report the file that actually failed the check, not always the first one
			msg << "Size mismatch: Manifest contains " << manifest->snps.size() << " probes, but "
			    << infiles[n] << " contains " << gtc->xRawIntensity.size() << " probes.";
			throw msg.str();
		}

		// if we have a sample name from the json file, use it
		const char *name = (n < sampleNames.size()) ? sampleNames[n].c_str()
		                                            : gtc->sampleName.c_str();
		const char *writtenName = "";
		if (!nameField.empty()) {
			// Bounded copy: an over-long name is truncated instead of overrunning
			// the field, and the last byte always stays a terminating NUL
			memset(&nameField[0], 0, nameField.size());
			strncpy(&nameField[0], name, nameField.size() - 1);
			sim->write(&nameField[0], sim->sampleNameSize);
			writtenName = &nameField[0];
		}
		if (verbose) {
			cerr << "Gtc file "
			     << n+1
			     << " of "
			     << infiles.size()
			     << "  File: "
			     << infiles[n]
			     << "  Sample: "
			     << writtenName
			     << endl;
		}
		// Note that we write the intensities in SNP order, sorted by position
		for (vector<snpClass>::iterator snp = manifest->snps.begin(); snp != manifest->snps.end(); ++snp) {
			double xn;
			double yn;
			int idx = snp->index - 1;   // index is zero based in arrays, but starts from 1 in the map file
			if (normalize) {
				// This is the normalization calculation, according to Illumina:
				// remove the offset, rotate by theta, remove the shear, then scale
				unsigned int norm = manifest->normIdMap[snp->normId];
				XFormClass *XF = &(gtc->XForm[norm]);
				double tempx = gtc->xRawIntensity[idx] - XF->xOffset;
				double tempy = gtc->yRawIntensity[idx] - XF->yOffset;
				double cos_theta = cos(XF->theta);
				double sin_theta = sin(XF->theta);
				double tempx2 = cos_theta * tempx + sin_theta * tempy;
				double tempy2 = -sin_theta * tempx + cos_theta * tempy;
				double tempx3 = tempx2 - XF->shear * tempy2;
				double tempy3 = tempy2;
				xn = tempx3 / XF->xScale;
				yn = tempy3 / XF->yScale;
			} else {
				xn = gtc->xRawIntensity[idx];
				yn = gtc->yRawIntensity[idx];
			}
			// the element type written must match the numberFormat given to writeHeader()
			if (numberFormat == 0) {
				float v;
				v = xn; sim->write(&v,sizeof(v));
				v = yn; sim->write(&v,sizeof(v));
			} else {
				uint16_t v;
				v = xn; sim->write(&v,sizeof(v));
				v = yn; sim->write(&v,sizeof(v));
			}
		}

	}

	sim->close();
}