void createSimFile(string fname) { Sim *sim = new Sim(); hash_map<string,string>::iterator i = gtcHash.begin(); gtc.open(i->second, Gtc::INTENSITY); sim->createFile(fname); sim->writeHeader(gtcHash.size(), gtc.xRawIntensity.size()); // // // Process each GTC file in turn // unsigned int n=1; for (hash_map<string,string>::iterator i = gtcHash.begin(); i != gtcHash.end(); i++) { char *buffer; if (verbose) cout << timestamp() << "Processing GTC file " << n << " of " << gtcHash.size() << endl; // // add sample name to each output file // no family info as yet (todo?) - write sample ID twice // fn << i->first << endl; gtc.open(i->second,Gtc::XFORM | Gtc::INTENSITY); // reload GTC file to read XForm and Intensity arrays buffer = new char[sim->sampleNameSize]; memset(buffer,0,sim->sampleNameSize); // if we have a sample name from the json file, use it if (sampleNames.size() > (n-1)) { strcpy(buffer, sampleNames[n-1].c_str()); } else { strcpy(buffer,gtc.sampleName.c_str()); } sim->write(buffer, sim->sampleNameSize); for (unsigned int idx = 0; idx < gtc.xRawIntensity.size(); idx++) { uint16_t v; v = gtc.xRawIntensity[idx]; sim->write(&v,sizeof(v)); v = gtc.yRawIntensity[idx]; sim->write(&v,sizeof(v)); } n++; #if 0 for (vector<snpClass>::iterator snp = manifest->snps.begin(); snp != manifest->snps.end(); snp++) { if (excludeCnv && snp->name.find("cnv") != string::npos) continue; if (chrSelect.size() && chrSelect.compare(snp->chromosome)) continue; int idx = snp->index - 1; // index is zero based in arrays, but starts from 1 in the map file unsigned int norm = manifest->normIdMap[snp->normId]; XFormClass *XF = >c.XForm[norm]; // first do the normalisation calculation double tempx = gtc.xRawIntensity[idx] - XF->xOffset; double tempy = gtc.yRawIntensity[idx] - XF->yOffset; double cos_theta = cos(XF->theta); double sin_theta = sin(XF->theta); double tempx2 = cos_theta * tempx + sin_theta * tempy; double tempy2 = -sin_theta * tempx + cos_theta * tempy; double tempx3 = tempx2 - XF->shear * tempy2; double tempy3 = tempy2; double xn = tempx3 / XF->xScale; double yn = tempy3 / XF->yScale; // add raw/norm x/y to .raw and .nor files // fn << "\t" << std::fixed << setprecision(3) << xn << " " << yn; } #endif } sim->close(); }
// // Create a SIM file from one or more GTC files // // infile a file containing either a simple list of GTC files, or a list in JSON format // outfile the name of the SIM file to create, or '-' to write to stdout // normalize if true, normalize the intensities, else store the raw values in the SIM file // manfile the name of the manifest file // verbose boolean (default false) // // Note the the SIM file is written with the intensities sorted into position order, as given // by the manifest file. // void commandCreate(string infile, string outfile, bool normalize, string manfile, bool verbose) { vector<string> sampleNames; // list of sample names from JSON input file vector<string> infiles; // list of GTC files to process Sim *sim = new Sim(); Gtc *gtc = new Gtc(); Manifest *manifest = new Manifest(); int numberFormat = normalize ? 0 : 1; // // First, get a list of GTC files. and possibly sample names // if (infile == "") throw("commandCreate(): infile not specified"); parseInfile(infile,sampleNames,infiles); if (infiles.size() == 0) throw("No GTC files are specified in the infile"); // Let's check the GTC files, shall we? for (unsigned int n = 0; n < infiles.size(); n++) { gtc->open(infiles[n],0); if (gtc->errorMsg.length()) throw gtc->errorMsg; } // We need a manifest file to sort the SNPs and to normalise the intensities (if required) loadManifest(manifest, manfile); // Sort the SNPs into position order sort(manifest->snps.begin(), manifest->snps.end(), SortByPosition); // Create the SIM file and write the header sim->createFile(outfile); sim->writeHeader(infiles.size(),gtc->numSnps, 2, numberFormat); // For each GTC file, write the sample name and intensities to the SIM file for (unsigned int n = 0; n < infiles.size(); n++) { gtc->open(infiles[n], Gtc::XFORM | Gtc::INTENSITY); if (manifest->snps.size() != gtc->xRawIntensity.size()) { ostringstream msg; msg << "Size mismatch: Manifest contains " << manifest->snps.size() << " probes, but " << infiles[0] << " contains " << gtc->xRawIntensity.size() << " probes."; throw msg.str(); } char *buffer = new char[sim->sampleNameSize]; memset(buffer,0,sim->sampleNameSize); // if we have a sample name from the json file, use it if (n < sampleNames.size()) { strcpy(buffer, sampleNames[n].c_str()); } else { strcpy(buffer,gtc->sampleName.c_str()); } sim->write(buffer, sim->sampleNameSize); if (verbose) { cerr << "Gtc file " << n+1 << " of " << infiles.size() << " File: " << infiles[n] << " Sample: " << buffer << endl; } // Note that we write the intensities in SNP order, sorted by position for (vector<snpClass>::iterator snp = manifest->snps.begin(); snp != manifest->snps.end(); snp++) { double xn; double yn; int idx = snp->index - 1; // index is zero based in arrays, but starts from 1 in the map file if (normalize) { // This is the normalization calculation, according to Illumina unsigned int norm = manifest->normIdMap[snp->normId]; XFormClass *XF = &(gtc->XForm[norm]); double tempx = gtc->xRawIntensity[idx] - XF->xOffset; double tempy = gtc->yRawIntensity[idx] - XF->yOffset; double cos_theta = cos(XF->theta); double sin_theta = sin(XF->theta); double tempx2 = cos_theta * tempx + sin_theta * tempy; double tempy2 = -sin_theta * tempx + cos_theta * tempy; double tempx3 = tempx2 - XF->shear * tempy2; double tempy3 = tempy2; xn = tempx3 / XF->xScale; yn = tempy3 / XF->yScale; } else { xn = gtc->xRawIntensity[idx]; yn = gtc->yRawIntensity[idx]; } if (numberFormat == 0) { float v; v = xn; sim->write(&v,sizeof(v)); v = yn; sim->write(&v,sizeof(v)); } else { uint16_t v; v = xn; sim->write(&v,sizeof(v)); v = yn; sim->write(&v,sizeof(v)); } } } sim->close(); }