Exemple #1
0
// Fills eel[1..M] with the expected effective length of every reference.
//
// The length distribution comes from model.getGLD().copyTo(), which hands
// back the pdf/cdf arrays together with lb, ub and span. Both arrays, and the
// local prefix-sum table, are released with delete[] before returning.
// Values smaller than MINEEL are stored as 0.0.
void calcExpectedEffectiveLengths(ModelType& model) {
  int lb, ub, span;
  double *pdf = NULL, *cdf = NULL;

  model.getGLD().copyTo(pdf, cdf, lb, ub, span);

  // weighted[k] = sum_{j=1..k} pdf[j] * (lb + j), with weighted[0] = 0
  double *weighted = new double[span + 1];
  weighted[0] = 0.0;
  for (int k = 0; k < span; ++k) {
    weighted[k + 1] = weighted[k] + pdf[k + 1] * (lb + k + 1);
  }

  eel.clear();
  eel.resize(M + 1, 0.0);
  for (int sid = 1; sid <= M; ++sid) {
    const int totLen = refs.getRef(sid).getTotLen();
    const int fullLen = refs.getRef(sid).getFullLen();

    // Both cut points are capped at ub, shifted by lb and floored at 0 so
    // they can index cdf[] and weighted[].
    const int lo = max(min(totLen - fullLen + 1, ub) - lb, 0);
    const int hi = max(min(totLen, ub) - lb, 0);

    if (hi != 0) {
      eel[sid] = fullLen * cdf[lo] + ((cdf[hi] - cdf[lo]) * (totLen + 1) - (weighted[hi] - weighted[lo]));
      assert(eel[sid] >= 0);
      if (eel[sid] < MINEEL) { eel[sid] = 0.0; }
    }
    else {
      eel[sid] = 0.0;
    }
  }

  delete[] weighted;
  delete[] cdf;
  delete[] pdf;
}
// Adds the contribution of one alignment group to the model's update
// statistics.
//
// ag_in_mem  - in-memory summary of the group; only its size is read here.
// aligns     - per-alignment records; entries with frac <= 0 are skipped.
// ag         - source of the read sequence(s), quality string(s) and CIGARs.
// noise_frac - weight handed to the noise profile update.
//
// When model_type >= 2 the second mate is processed the same way, on the '-'
// strand at position refLen - pos - fragment_length.
inline void PROBerReadModel::update(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag, double noise_frac) {
	SEQstring seq;
	QUALstring qual;
	CIGARstring cigar;
	const RefSeq* refseq = NULL;
	const bool has_qual = (model_type & 1) != 0;

	// The getters below fill their output argument, so they must be executed
	// even when assert() is compiled out (NDEBUG). Only the returned status is
	// asserted.
	bool ok = ag.getSEQ(seq);
	assert(ok);
	if (has_qual) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}
	// update noise prob
	npro->update(seq, noise_frac);
	// update alignment probs
	for (int i = 0; i < ag_in_mem->size; ++i) {
		if (aligns[i].frac > 0.0) {
			refseq = refs->getRef(aligns[i].tid);
			ok = ag.getAlignment(i)->getCIGAR(cigar);
			assert(ok);
			seqmodel->update(aligns[i].frac, '+', aligns[i].pos, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
		}
	}

	if (model_type >= 2) {
		// paired-end reads: repeat everything for mate 2
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		if (has_qual) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}
		// update noise prob
		npro->update(seq, noise_frac);
		// update alignment probs
		for (int i = 0; i < ag_in_mem->size; ++i) {
			if (aligns[i].frac > 0.0) {
				refseq = refs->getRef(aligns[i].tid);
				ok = ag.getAlignment(i)->getCIGAR(cigar, 2);
				assert(ok);
				assert(aligns[i].fragment_length > 0);
				seqmodel->update(aligns[i].frac, '-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
			}
		}
	}

	(void)ok; // only read by assert(); silences unused warnings under NDEBUG
}
// Computes the conditional probabilities for one alignment group.
//
// ag_in_mem - receives noise_conprb; its size bounds the alignment loop.
// aligns    - per-alignment records; entries whose conprb equals -1.0 are
//             left untouched, all others get conprb overwritten.
// ag        - source of the read sequence(s), quality string(s) and CIGARs.
//
// When model_type >= 2 the probabilities of the second mate are multiplied
// into the values computed for the first mate.
inline void PROBerReadModel::setConProbs(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag) {
	int seqlen;
	SEQstring seq; // seq, qual and cigar must be in each function since we have multiple threads!
	QUALstring qual;
	CIGARstring cigar;
	const RefSeq* refseq = NULL;
	const bool has_qual = (model_type & 1) != 0;

	// The getters below fill their output argument, so they must be executed
	// even when assert() is compiled out (NDEBUG). Only the returned status is
	// asserted.

	// Get read sequences and quality scores
	bool ok = ag.getSEQ(seq);
	assert(ok);
	seqlen = read_length < 0 ? ag.getSeqLength() : read_length;
	if (has_qual) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}
	// set noise probability
	ag_in_mem->noise_conprb = mld1->getProb(seqlen) * npro->getProb(seq);
	// set alignment probabilities
	for (int i = 0; i < ag_in_mem->size; ++i) {
		if (aligns[i].conprb != -1.0) {
			refseq = refs->getRef(aligns[i].tid);
			ok = ag.getAlignment(i)->getCIGAR(cigar);
			assert(ok);
			aligns[i].conprb = (aligns[i].fragment_length > 0 ? mld1->getProb(seqlen, aligns[i].fragment_length) : mld1->getProb(seqlen)) *
				seqmodel->getProb('+', aligns[i].pos, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
		}
	}

	if (model_type >= 2) {
		// paired-end reads: fold in mate 2
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		seqlen = read_length < 0 ? ag.getSeqLength(2) : read_length;
		if (has_qual) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}
		ag_in_mem->noise_conprb *= mld2->getProb(seqlen) * npro->getProb(seq);
		for (int i = 0; i < ag_in_mem->size; ++i) {
			if (aligns[i].conprb != -1.0) {
				refseq = refs->getRef(aligns[i].tid);
				ok = ag.getAlignment(i)->getCIGAR(cigar, 2);
				assert(ok);
				assert(aligns[i].fragment_length > 0);
				aligns[i].conprb *= mld2->getProb(seqlen, aligns[i].fragment_length) *
					seqmodel->getProb('-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
			}
		}
	}

	(void)ok; // only read by assert(); silences unused warnings under NDEBUG
}
// Simulates one read (or read pair) and writes it to the output stream(s).
//
// rid, tid, pos and fragment_length are echoed into the read name as
// rid_tid_pos_fragmentLength_cigar. tid == 0 produces a noise read drawn from
// the noise profile with cigar "*"; any other tid simulates from reference
// tid through the sequencing model. Bit 0 of model_type selects the format
// with quality scores ('@' header plus '+' and quality lines); model_type >= 2
// additionally writes mate 2 to out2.
inline void PROBerReadModel::simulate(READ_INT_TYPE rid, int tid, int pos, int fragment_length, std::ofstream* out1, std::ofstream* out2) {
	const bool with_qual = (model_type & 1) != 0;
	const bool paired = model_type >= 2;

	int len1, len2;
	std::string readseq1, readseq2, qual1, qual2, cigar1, cigar2;

	// Simulate reads
	if (tid != 0) {
		const RefSeq* ref = refs->getRef(tid);

		len1 = mld1->simulate(sampler, fragment_length);
		if (with_qual) qd->simulate(sampler, len1, qual1);
		seqmodel->simulate(sampler, len1, '+', pos, ref, qual1, cigar1, readseq1);

		if (paired) {
			len2 = mld2->simulate(sampler, fragment_length);
			if (with_qual) qd->simulate(sampler, len2, qual2);
			// mate 2 is placed on the '-' strand
			const int mate2_pos = ref->getLen() - pos - fragment_length;
			seqmodel->simulate(sampler, len2, '-', mate2_pos, ref, qual2, cigar2, readseq2);
		}
	}
	else {
		// noise read: no alignment, so no real CIGAR
		cigar1 = cigar2 = "*";

		len1 = mld1->simulate(sampler);
		if (with_qual) qd->simulate(sampler, len1, qual1);
		npro->simulate(sampler, len1, readseq1);

		if (paired) {
			len2 = mld2->simulate(sampler);
			if (with_qual) qd->simulate(sampler, len2, qual2);
			npro->simulate(sampler, len2, readseq2);
		}
	}

	// Output reads
	const char header_tag = with_qual ? '@' : '>';

	(*out1) << header_tag << rid << '_' << tid << '_' << pos << '_' << fragment_length << '_' << cigar1 << (paired ? "/1" : "") << std::endl;
	(*out1) << readseq1 << std::endl;
	if (with_qual) {
		(*out1) << '+' << std::endl << qual1 << std::endl;
	}

	if (!paired) return;

	(*out2) << header_tag << rid << '_' << tid << '_' << pos << '_' << fragment_length << '_' << cigar2 << "/2" << std::endl;
	(*out2) << readseq2 << std::endl;
	if (with_qual) {
		(*out2) << '+' << std::endl << qual2 << std::endl;
	}
}
Exemple #5
0
/*!
 * Sort \p obs and regroup equal objects under freshly created duplicate hubs.
 *
 * \param obs      array of \p obs_N object pointers; sorted in place.
 * \param obs_N    number of entries in \p obs.
 * \param dupT     selects the comparison: by name or by content. Any other
 *                 value is reported through PTODO and nothing is changed.
 * \param new_dups receives every duplicate hub created here.
 *
 * Every object is first disconnected from its existing parents. Objects that
 * compare equal to at least one neighbour are then connected as children of
 * one new hub per run of equal objects.
 */
void parr_update_duplicates(Ob ** obs, size_t obs_N, PDUP_t dupT, Refs& new_dups)
{
    REL_t pHUB = REL_PARENT;
    REL_t pEXA = REL_CHILD;

    // pick the comparison function matching the requested duplicate type
    cmp_function_t *cmp = nullptr;
    if (dupT == PDUP_NAME) {
        cmp = voidp_ob_cmp_name;
    } else if (dupT == PDUP_CONTENT) {
        cmp = voidp_ob_cmp_std;
    } else {
        PTODO("Cannot handle dupT:%d (yet)\n", dupT);
        return;
    }

    // after sorting, equal objects sit next to each other
    qsort_mt(obs, obs_N, sizeof(Ob*), cmp, 0,0);

    size_t run_begin = 0;
    while (run_begin < obs_N) {
        // extend [run_begin, run_end) while entries equal obs[run_begin]
        size_t run_end = run_begin + 1;
        while (run_end < obs_N && cmp(&obs[run_begin], &obs[run_end]) == 0) {
            ++run_end;
        }

        const size_t run_len = run_end - run_begin; // multiplicity
        if (run_len < 2) {
            // unique object: only drop its previous parents
            obs[run_begin]->net_disconnectM(pHUB);
        } else {
            // obs[run_begin ... run_end-1] are duplicates of each other
            Dup * pdup = gen::dup(dupT);
            new_dups.app(pdup);
            for (size_t k = run_begin; k != run_end; ++k) {
                obs[k]->net_disconnectM(pHUB);   // disconnect from all previous
                net_connectS(pEXA, obs[k],       // connect to new
                             pHUB, pdup,
                             true);
            }
        }
        run_begin = run_end;
    }
}
Exemple #6
0
// Allocates and fills the per-thread data structures used by the EM.
//
// readers - receives nThreads read readers, all sharing the same ReadIndex
//           objects and each positioned at the first read of its thread.
// hitvs   - receives nThreads hit containers loaded from "<imdName>.dat".
// ncpvs   - receives one zero-filled double array per thread, sized by the
//           number of reads assigned to that thread.
// mhps    - receives nThreads model helpers.
//
// Also allocates the file-level probv and countvs arrays. The run is stopped
// through general_assert when the .dat file is missing, does not match N1 or
// read_type, or cannot be read completely.
void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, double **&ncpvs, ModelType **&mhps) {
	READ_INT_TYPE nReads;
	HIT_INT_TYPE nHits;
	int rt; // read type

	READ_INT_TYPE nrLeft, curnr; // nrLeft : number of reads left, curnr: current number of reads
	HIT_INT_TYPE nhT; // nhT : hit threshold per thread
	char datF[STRLEN];

	int s;
	char readFs[2][STRLEN];
	// Only the first s slots are filled below; the rest must not be left as
	// indeterminate pointers because the whole array is handed to setIndices().
	ReadIndex *indices[2] = { NULL, NULL };
	ifstream fin;

	readers = new ReadReader<ReadType>*[nThreads];
	genReadFileNames(imdName, 1, read_type, s, readFs);
	for (int i = 0; i < s; i++) {
		indices[i] = new ReadIndex(readFs[i]);
	}
	for (int i = 0; i < nThreads; i++) {
		readers[i] = new ReadReader<ReadType>(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function
		readers[i]->setIndices(indices);
	}

	hitvs = new HitContainer<HitType>*[nThreads];
	for (int i = 0; i < nThreads; i++) {
		hitvs[i] = new HitContainer<HitType>();
	}

	// Bounded formatting: an over-long imdName is truncated (and then fails the
	// open check below) instead of overflowing datF.
	snprintf(datF, sizeof(datF), "%s.dat", imdName);
	fin.open(datF);
	general_assert(fin.is_open(), "Cannot open " + cstrtos(datF) + "! It may not exist.");
	fin>>nReads>>nHits>>rt;
	general_assert(nReads == N1, "Number of alignable reads does not match!");
	general_assert(rt == read_type, "Data file (.dat) does not have the right read type!");


	// Simple load balancing: every thread keeps taking reads until its hit
	// count reaches nHits / nThreads, while always leaving at least one read
	// for each thread that follows. The last thread takes whatever remains.
	nhT = nHits / nThreads;
	nrLeft = N1;
	curnr = 0;

	ncpvs = new double*[nThreads];
	for (int i = 0; i < nThreads; i++) {
		HIT_INT_TYPE ntLeft = nThreads - i - 1; // # of threads left

		general_assert(readers[i]->locate(curnr), "Read indices files do not match!");

		while (nrLeft > ntLeft && (i == nThreads - 1 || hitvs[i]->getNHits() < nhT)) {
			general_assert(hitvs[i]->read(fin), "Cannot read alignments from .dat file!");

			--nrLeft;
			if (verbose && nrLeft % 1000000 == 0) { cout<< "DAT "<< nrLeft << " reads left"<< endl; }
		}
		ncpvs[i] = new double[hitvs[i]->getN()];
		memset(ncpvs[i], 0, sizeof(double) * hitvs[i]->getN());
		curnr += hitvs[i]->getN();

		if (verbose) { cout<<"Thread "<< i<< " : N = "<< hitvs[i]->getN()<< ", NHit = "<< hitvs[i]->getNHits()<< endl; }
	}

	fin.close();

	mhps = new ModelType*[nThreads];
	for (int i = 0; i < nThreads; i++) {
		mhps[i] = new ModelType(mparams, false); // just model helper
	}

	probv = new double[M + 1];
	countvs = new double*[nThreads];
	for (int i = 0; i < nThreads; i++) {
		countvs[i] = new double[M + 1];
	}


	if (verbose) { printf("EM_init finished!\n"); }
}
Exemple #7
0
// Entry point of rsem-run-em.
//
// Parses the four positional arguments and the optional flags, loads the
// reference, group and transcript information, reads the read counts and the
// model parameters, and then runs the EM for the requested read type.
// Returns 0 on success; usage errors and an unknown read type exit with -1.
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 5) {
		printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  sampleToken: sampleName excludes the path\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
	sprintf(statName, "%s.stat/%s", argv[3], argv[4]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	pt_fn_list = pt_chr_list = NULL;

	// Options that take values are checked against argc first, so a flag given
	// as the last argument is reported instead of reading past the end of argv.
	for (int i = 5; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) {
			general_assert(i + 1 < argc, "Option -p requires the number of threads!");
			nThreads = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-b")) {
			general_assert(i + 3 < argc, "Option -b requires samInpType, samInpF and has_fn_list_?!");
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				general_assert(i + 4 < argc, "Option -b requires fn_list when has_fn_list_? is 1!");
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();
	sprintf(groupF, "%s.grp", refName);
	gi.load(groupF);
	m = gi.getm();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	// never use more threads than there are alignable reads
	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "! It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}
Exemple #8
0
int main(int argc, char* argv[]) {
	if (argc < 2) {
		printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n");
		exit(-1);
	}

	hasGTF = false;
	mappingType = 0;
	n2g_idx = false;
	
	int argpos = 2;
	while (argpos < argc) {
		if (!strcmp(argv[argpos], "--gtf")) {
			hasGTF = true;
			strcpy(gtfF, argv[++argpos]);
		}
		if (!strcmp(argv[argpos], "--mapping")) {
			mappingType = 1;
			mappingPos = ++argpos;
		}
		if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2;
		if (!strcmp(argv[argpos], "--files")) {
			num_files = atoi(argv[++argpos]);
			file_pos = argpos + 1; // the position in argv for the first file
			argpos += num_files;
		}
		if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true;
		if (!strcmp(argv[argpos], "-q")) verbose = false;
		++argpos;
	}

	if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]);

	ifstream fin;
	string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence
	string seqname, gene_id, transcript_id;
	
	if (hasGTF) {
		transcripts.setType(0);
		assert(mappingType < 2);
		parse_gtf_file(gtfF);

		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		seqs.assign(M + 1, "");
		
		chrvec.clear();
		
		for (int i = 0; i < num_files; ++i, ++file_pos) {
			fin.open(argv[file_pos]);
			general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! It may not exist.");
			getline(fin, line);
			while ((fin) && (line[0] == '>')) {
	istringstream strin(line.substr(1));
	strin>>seqname;
	
	gseq = "";
	while((getline(fin, line)) && (line[0] != '>')) {
		gseq += line;
	}
	assert(gseq.length() > 0);
			
	sn2tr_iter = sn2tr.find(seqname);
	if (sn2tr_iter == sn2tr.end()) continue;
	
	chrvec.push_back(ChrInfo(seqname, gseq.length()));
	
	vector<int>& vec = sn2tr_iter->second;
	int s = vec.size();
	for (int j = 0; j < s; ++j) {
		assert(vec[j] > 0 && vec[j] <= M);
		transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
	}
			}
			fin.close();

			if (verbose) { printf("%s is processed!\n", argv[file_pos]); } 
		}
		
		sort(chrvec.begin(), chrvec.end());

		// Shrink and build up Refs
		int curp = 0;
		for (int i = 1; i <= M; ++i) {
			const Transcript& transcript = transcripts.getTranscriptAt(i);
			if (seqs[i] == "") 
	printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
			else {
	refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs
	++curp;
	transcripts.move(i, curp);
			}
		}
		printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp);
		
		transcripts.setM(curp);
		M = transcripts.getM();
		general_assert(M > 0, "The reference contains no transcripts!");
		assert(refs.getM() == M);
	}
	else {
Exemple #9
0
void writeToDisk(char* refName) {
	ofstream fout;

	sprintf(tiF, "%s.ti", refName);
	transcripts.writeTo(tiF);
	if (verbose) { printf("Transcript Information File is generated!\n"); }
	
	sprintf(refFastaF, "%s.transcripts.fa", refName);
	refs.writeTo(refFastaF);

	sprintf(transListF, "%s.translist", refName);
	refs.writeTransListTo(transListF);

	sprintf(chromListF, "%s.chrlist", refName);
	fout.open(chromListF);
	for (int i = 0; i < (int)chrvec.size(); ++i)
		fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl;
	fout.close();
	if (verbose) { printf("Chromosome List File is generated!\n"); }
	
	string cur_gene_id, cur_transcript_id, name;
	vector<int> gi, gt, ta;

	cur_gene_id = ""; gi.clear(); 
	if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
	for (int i = 1; i <= M; ++i) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		if (cur_gene_id != transcript.getGeneID()) {
			gi.push_back(i);
			if (mappingType == 2) gt.push_back((int)ta.size());
			cur_gene_id = transcript.getGeneID();
		}
		if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
			ta.push_back(i);
			cur_transcript_id = transcript.getTranscriptID();
		}
	}
	
	gi.push_back(M + 1);
	if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }

	sprintf(groupF, "%s.grp", refName);
	fout.open(groupF);
	for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl;
	fout.close();
	if (verbose) { printf("Group File is generated!\n"); }

	if (mappingType == 2) {
		sprintf(gtF, "%s.gt", refName);
		fout.open(gtF);
		for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl;
		fout.close();
		sprintf(taF, "%s.ta", refName);
		fout.open(taF);
		for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl;
		fout.close();
		if (verbose) { printf("Allele-specific group files are generated!\n"); }
	}

	if (n2g_idx) {
		sprintf(n2g_idxF, "%s.n2g.idx.fa", refName);
		fout.open(n2g_idxF);
		for (int i = 1; i <= M; ++i) 
			fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl;
		fout.close();
		if (verbose) printf("%s is generated!\n", n2g_idxF);
	}
}
Exemple #10
0
// Entry point of rsem-calculate-credibility-intervals.
//
// Parses the seven positional arguments and the optional -p / -q flags, loads
// the reference and group information, reads the model type from
// "<sample_name>.stat/<sampleToken>.model", samples theta vectors from the
// count vectors (Phase I) and computes the credibility intervals (Phase II).
// Returns 0 on success; usage errors and an unknown model type exit with -1.
int main(int argc, char* argv[]) {
	if (argc < 8) {
		printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n");
		exit(-1);
	}

	confidence = atof(argv[4]);
	nCV = atoi(argv[5]);
	nSpC = atoi(argv[6]);
	nMB = atoi(argv[7]);

	nThreads = 1;
	quiet = false;
	for (int i = 8; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) {
			// a trailing "-p" must not read past the end of argv
			general_assert(i + 1 < argc, "Option -p requires the number of threads!");
			nThreads = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-q")) quiet = true;
	}
	verbose = !quiet;

	sprintf(refF, "%s.seq", argv[1]);
	refs.loadRefs(refF, 1);
	M = refs.getM();
	sprintf(groupF, "%s.grp", argv[1]);
	gi.load(groupF);
	m = gi.getm();

	nSamples = nCV * nSpC;
	cvlen = M + 1;
	assert(nSamples > 0 && cvlen > 1); // for Buffer.h: (bufsize_type)nSamples

	sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
	sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
	sprintf(tmpF, "%s.tmp", imdName);
	sprintf(cvsF, "%s.countvectors", imdName);

	sprintf(modelF, "%s.model", statName);
	FILE *fi = fopen(modelF, "r");
	general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
	// The read must happen outside assert(): with NDEBUG the whole assert
	// expression is removed and model_type would never be set.
	const int nFields = fscanf(fi, "%d", &model_type);
	fclose(fi);
	general_assert(nFields == 1, "Cannot read the model type from " + cstrtos(modelF) + "!");

	// Phase I
	switch(model_type) {
	case 0 : sample_theta_vectors_from_count_vectors<SingleModel>(); break;
	case 1 : sample_theta_vectors_from_count_vectors<SingleQModel>(); break;
	case 2 : sample_theta_vectors_from_count_vectors<PairedEndModel>(); break;
	case 3 : sample_theta_vectors_from_count_vectors<PairedEndQModel>(); break;
	// without a default an unknown type would skip Phase I and still run Phase II
	default : fprintf(stderr, "Unknown model type!\n"); exit(-1);
	}

	// Phase II
	calculate_credibility_intervals(imdName);

	/*
	sprintf(command, "rm -f %s", tmpF);
	int status = system(command);
	if (status != 0) {
		fprintf(stderr, "Cannot delete %s!\n", tmpF);
		exit(-1);
	}
	*/

	return 0;
}
Exemple #11
0
// Entry point of rsem-run-em (variant with --seed and
// --calc-evaluation-score).
//
// Parses the five positional arguments and the optional flags, loads the
// reference and transcript information, reads the read counts and the model
// parameters, runs the EM for the requested read type and, when requested,
// writes the evaluation scores and expression files.
// Returns 0 on success; usage errors and an unknown read type exit with -1.
int main(int argc, char* argv[]) {
	ifstream fin;
	bool quiet = false;

	if (argc < 6) {
		printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--calc-evaluation-score nb_r nb_p L w]\n\n");
		printf("  refName: reference name\n");
		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
		printf("  sampleName: sample's name, including the path\n");
		printf("  sampleToken: sampleName excludes the path\n");
		printf("  -p: number of threads which user wants to use. (default: 1)\n");
		printf("  -b: produce bam format output file. (default: off)\n");
		printf("  -q: set it quiet\n");
		printf("  --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n");
		printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
		printf("  --seed uint32: the seed used for the BAM sampling. (default: off)\n");
		printf("  --calc-evaluation-score nb_r nb_p L w: "
				"nb_r and nb_p are parameters for the true transcript length distribution, which is modeled by a negative binomial distribution; "
				"L is the read length and w is the mininum overlap required for joining two contigs.\n");
		printf("// model parameters should be in imdName.mparams.\n");
		exit(-1);
	}

	time_t a = time(NULL);

	strcpy(refName, argv[1]);
	read_type = atoi(argv[2]);
	strcpy(outName, argv[3]);
	strcpy(imdName, argv[4]);
	strcpy(statName, argv[5]);

	nThreads = 1;

	genBamF = false;
	bamSampling = false;
	genGibbsOut = false;
	calcEvalScore = false;
	pt_fn_list = NULL;
	hasSeed = false;

	// Options that take values are checked against argc first, so a flag given
	// as the last argument is reported instead of reading past the end of argv.
	for (int i = 6; i < argc; i++) {
		if (!strcmp(argv[i], "-p")) {
			general_assert(i + 1 < argc, "Option -p requires the number of threads!");
			nThreads = atoi(argv[i + 1]);
		}
		if (!strcmp(argv[i], "-b")) {
			general_assert(i + 3 < argc, "Option -b requires samInpType, samInpF and has_fn_list_?!");
			genBamF = true;
			inpSamType = argv[i + 1][0];
			strcpy(inpSamF, argv[i + 2]);
			if (atoi(argv[i + 3]) == 1) {
				general_assert(i + 4 < argc, "Option -b requires fn_list when has_fn_list_? is 1!");
				strcpy(fn_list, argv[i + 4]);
				pt_fn_list = (char*)(&fn_list);
			}
		}
		if (!strcmp(argv[i], "-q")) { quiet = true; }
		if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
		if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
		if (!strcmp(argv[i], "--seed")) {
			general_assert(i + 1 < argc, "Option --seed requires a seed value!");
			hasSeed = true;
			int len = strlen(argv[i + 1]);
			seed = 0;
			// decimal parse; any non-digit would otherwise be folded into the
			// seed as a meaningless value
			for (int k = 0; k < len; k++) {
				general_assert(argv[i + 1][k] >= '0' && argv[i + 1][k] <= '9', "The seed should be a non-negative integer!");
				seed = seed * 10 + (argv[i + 1][k] - '0');
			}
		}
		if (!strcmp(argv[i], "--calc-evaluation-score")) {
			general_assert(i + 4 < argc, "Option --calc-evaluation-score requires nb_r, nb_p, L and w!");
			calcEvalScore = true;
			nb_r = atof(argv[i + 1]);
			nb_p = atof(argv[i + 2]);
			L = atoi(argv[i + 3]);
			w = atoi(argv[i + 4]);
		}
	}

	general_assert(nThreads > 0, "Number of threads should be bigger than 0!");

	verbose = !quiet;

	//basic info loading
	sprintf(refF, "%s.seq", refName);
	refs.loadRefs(refF);
	M = refs.getM();

	sprintf(tiF, "%s.ti", refName);
	transcripts.readFrom(tiF);

	sprintf(cntF, "%s.cnt", statName);
	fin.open(cntF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");

	fin>>N0>>N1>>N2>>N_tot;
	fin.close();

	general_assert(N1 > 0, "There are no alignable reads!");

	// never use more threads than there are alignable reads
	if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;

	//set model parameters
	mparams.M = M;
	mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2;
	mparams.refs = &refs;

	sprintf(mparamsF, "%s.mparams", imdName);
	fin.open(mparamsF);

	general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "! It may not exist.");

	fin>> mparams.minL>> mparams.maxL>> mparams.probF;
	int val; // 0 or 1 , for estRSPD
	fin>>val;
	mparams.estRSPD = (val != 0);
	fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd;
	fin>> mparams.seedLen;
	fin.close();

	//run EM
	switch(read_type) {
	case 0 : EM<SingleRead, SingleHit, SingleModel>(); break;
	case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break;
	case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break;
	case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break;
	default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1);
	}

	if (calcEvalScore) {
		CalcEvalScore ces(refs, nb_r, nb_p, L, w, statName);
		sprintf(scoreF, "%s.score", outName);
		ces.writeScoresTo(scoreF);

		// group information is loaded here because only this step uses it
		char groupF[STRLEN];
		GroupInfo gi;
		sprintf(groupF, "%s.grp", argv[1]);
		gi.load(groupF);

		ces.generateExpressionFiles(gi, transcripts, scoreF);
	}

	time_t b = time(NULL);

	printTimeUsed(a, b, "EM.cpp");

	return 0;
}