void writeResults(char* refName) { int s; ofstream fout; sprintf(groupF, "%s.grp", refName); sprintf(tiF, "%s.ti", refName); sprintf(refFastaF, "%s.transcripts.fa", refName); sprintf(chromListF, "%s.chrlist", refName); fout.open(groupF); s = starts.size(); for (int i = 0; i < s; i++) fout<<starts[i]<<endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } fout.open(chromListF); s = chrvec.size(); for (int i = 0; i < s; i++) { fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<endl; } fout.close(); if (verbose) { printf("Chromosome List File is generated!\n"); } fout.open(refFastaF); for (int i = 1; i <= M; i++) { fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<endl; fout<<seqs[i]<<endl; } fout.close(); if (verbose) { printf("Extracted Sequences File is generated!\n"); } }
void buildTranscript(int sp, int ep) { int cur_s, cur_e; // current_start, current_end string transcript_id = items[sp].getTranscriptID(); string gene_id = items[sp].getGeneID(); string gene_name = "", transcript_name = ""; char strand = items[sp].getStrand(); string seqname = items[sp].getSeqName(); string left = items[sp].getLeft(); vec.clear(); cur_s = cur_e = -1; for (int i = sp; i <= ep; ++i) { int start = items[i].getStart(); int end = items[i].getEnd(); general_assert(strand == items[i].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!"); general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!"); if (items[i].getGeneName() != "") { if (gene_name == "") gene_name = items[i].getGeneName(); else general_assert(gene_name == items[i].getGeneName(), "A transcript is associated with multiple gene names!"); } if (items[i].getTranscriptName() != "") { if (transcript_name == "") transcript_name = items[i].getTranscriptName(); else general_assert(transcript_name == items[i].getTranscriptName(), "A transcript is associated with multiple transcript names!"); } if (cur_e + 1 < start) { if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); cur_s = start; } cur_e = (cur_e < end ? end : cur_e); } if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); // if (gene_name != "") gene_id += "_" + gene_name; // if (transcript_name != "") transcript_id += "_" + transcript_name; transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left)); }
bool buildTranscript(int sp, int ep) { int cur_s, cur_e; // current_start, current_end vector<Interval> vec; string transcript_id = items[sp].getTranscriptID(); string gene_id = items[sp].getGeneID(); char strand = items[sp].getStrand(); string seqname = items[sp].getSeqName(); string left = items[sp].getLeft(); vec.clear(); cur_s = cur_e = -1; for (int i = sp; i <= ep; i++) { int start = items[i].getStart(); int end = items[i].getEnd(); if (strand != items[i].getStrand()) { fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n"); exit(-1); } if (seqname != items[i].getSeqName()) { fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n"); exit(-1); } if (cur_e + 1 < start) { if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); cur_s = start; } cur_e = (cur_e < end ? end : cur_e); } if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e)); transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left)); return true; }
int main(int argc, char* argv[]) { ifstream fin; bool quiet = false; if (argc < 5) { printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); printf(" sampleName: sample's name, including the path\n"); printf(" sampleToken: sampleName excludes the path\n"); printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); printf(" --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n"); printf(" --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n"); printf("// model parameters should be in imdName.mparams.\n"); exit(-1); } time_t a = time(NULL); strcpy(refName, argv[1]); read_type = atoi(argv[2]); strcpy(outName, argv[3]); sprintf(imdName, "%s.temp/%s", argv[3], argv[4]); sprintf(statName, "%s.stat/%s", argv[3], argv[4]); nThreads = 1; genBamF = false; bamSampling = false; genGibbsOut = false; pt_fn_list = pt_chr_list = NULL; for (int i = 5; i < argc; i++) { if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); } if (!strcmp(argv[i], "-b")) { genBamF = true; inpSamType = argv[i + 1][0]; strcpy(inpSamF, argv[i + 2]); if (atoi(argv[i + 3]) == 1) { strcpy(fn_list, argv[i + 4]); pt_fn_list = (char*)(&fn_list); } } if (!strcmp(argv[i], "-q")) { quiet = true; } if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; } if (!strcmp(argv[i], "--sampling")) { bamSampling = true; } } general_assert(nThreads > 0, "Number of threads should be bigger than 0!"); verbose = !quiet; //basic info loading sprintf(refF, "%s.seq", refName); refs.loadRefs(refF); M = refs.getM(); 
sprintf(groupF, "%s.grp", refName); gi.load(groupF); m = gi.getm(); sprintf(tiF, "%s.ti", refName); transcripts.readFrom(tiF); sprintf(cntF, "%s.cnt", statName); fin.open(cntF); general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist."); fin>>N0>>N1>>N2>>N_tot; fin.close(); general_assert(N1 > 0, "There are no alignable reads!"); if ((READ_INT_TYPE)nThreads > N1) nThreads = N1; //set model parameters mparams.M = M; mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2; mparams.refs = &refs; sprintf(mparamsF, "%s.mparams", imdName); fin.open(mparamsF); general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist."); fin>> mparams.minL>> mparams.maxL>> mparams.probF; int val; // 0 or 1 , for estRSPD fin>>val; mparams.estRSPD = (val != 0); fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd; fin>> mparams.seedLen; fin.close(); //run EM switch(read_type) { case 0 : EM<SingleRead, SingleHit, SingleModel>(); break; case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break; case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break; case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break; default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1); } time_t b = time(NULL); printTimeUsed(a, b, "EM.cpp"); return 0; }
/**
 * Writes the model and the expression results.
 *
 * Output files:
 *   <statName>.model   - written by model.write()
 *   <imdName>.iso_res  - four tab-separated rows over transcripts 1..M:
 *                        transcript IDs, counts, tau values, gene IDs
 *   <imdName>.gene_res - four tab-separated rows over the m groups of `gi`:
 *                        gene IDs, summed counts, summed tau values, and the
 *                        comma-separated transcript IDs of each group
 *
 * tau[i] is theta[i] / eel[i] for every i with eel[i] >= EPSILON (0 otherwise),
 * normalized so that the values sum to 1.
 *
 * Both result files are now checked after fopen(); previously a failed open
 * passed a NULL stream to fprintf()/fclose().
 *
 * NOTE(review): ModelType is presumably a template parameter declared on a
 * line preceding this definition - confirm.
 *
 * @param model   model object; only model.write() is called on it
 * @param counts  per-transcript counts, read at indices 1..M
 */
void writeResults(ModelType& model, double* counts) {
    double denom;
    char outF[STRLEN];
    FILE *fo;

    sprintf(modelF, "%s.model", statName);
    model.write(modelF);

    //calculate tau values
    double *tau = new double[M + 1];
    memset(tau, 0, sizeof(double) * (M + 1));

    denom = 0.0;
    for (int i = 1; i <= M; i++)
        if (eel[i] >= EPSILON) {
            tau[i] = theta[i] / eel[i];
            denom += tau[i];
        }

    general_assert(denom > 0, "No alignable reads?!");

    for (int i = 1; i <= M; i++) {
        tau[i] /= denom;
    }

    //isoform level results
    sprintf(outF, "%s.iso_res", imdName);
    fo = fopen(outF, "w");
    general_assert(fo != NULL, string("Cannot open ") + outF + " for writing!");
    for (int i = 1; i <= M; i++) {
        const Transcript& transcript = transcripts.getTranscriptAt(i);
        fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
    }
    for (int i = 1; i <= M; i++)
        fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
    for (int i = 1; i <= M; i++)
        fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
    for (int i = 1; i <= M; i++) {
        const Transcript& transcript = transcripts.getTranscriptAt(i);
        fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
    }
    fclose(fo);

    //gene level results
    sprintf(outF, "%s.gene_res", imdName);
    fo = fopen(outF, "w");
    general_assert(fo != NULL, string("Cannot open ") + outF + " for writing!");
    for (int i = 0; i < m; i++) {
        const string& gene_id = transcripts.getTranscriptAt(gi.spAt(i)).getGeneID();
        fprintf(fo, "%s%c", gene_id.c_str(), (i < m - 1 ? '\t' : '\n'));
    }
    for (int i = 0; i < m; i++) {
        double sumC = 0.0; // sum of counts
        // Group i covers the transcript indices [spAt(i), spAt(i + 1)).
        int b = gi.spAt(i), e = gi.spAt(i + 1);
        for (int j = b; j < e; j++) sumC += counts[j];
        fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
    }
    for (int i = 0; i < m; i++) {
        double sumT = 0.0; // sum of tau values
        int b = gi.spAt(i), e = gi.spAt(i + 1);
        for (int j = b; j < e; j++) sumT += tau[j];
        fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
    }
    for (int i = 0; i < m; i++) {
        int b = gi.spAt(i), e = gi.spAt(i + 1);
        for (int j = b; j < e; j++) {
            // ',' inside a group, '\t' between groups, '\n' after the last one
            fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n')));
        }
    }
    fclose(fo);

    delete[] tau;

    if (verbose) { printf("Expression Results are written!\n"); }
}
int main(int argc, char* argv[]) { bool quiet = false; if (argc < 6) { printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n"); exit(-1); } strcpy(fn_list, ""); read_type = 0; if (argc > 6) { for (int i = 6; i < argc; i++) { if (!strcmp(argv[i], "-t")) { read_type = atoi(argv[i + 1]); } if (!strcmp(argv[i], "-l")) { strcpy(fn_list, argv[i + 1]); } if (!strcmp(argv[i], "-tag")) { SamParser::setReadTypeTag(argv[i + 1]); } if (!strcmp(argv[i], "-q")) { quiet = true; } } } verbose = !quiet; sprintf(groupF, "%s.grp", argv[1]); gi.load(groupF); sprintf(tiF, "%s.ti", argv[1]); transcripts.readFrom(tiF); sprintf(datF, "%s.dat", argv[2]); sprintf(cntF, "%s.cnt", argv[3]); init(argv[2], argv[4][0], argv[5]); hit_out.open(datF); string firstLine(99, ' '); firstLine.append(1, '\n'); //May be dangerous! hit_out<<firstLine; switch(read_type) { case 0 : parseIt<SingleRead, SingleHit>(parser); break; case 1 : parseIt<SingleReadQ, SingleHit>(parser); break; case 2 : parseIt<PairedEndRead, PairedEndHit>(parser); break; case 3 : parseIt<PairedEndReadQ, PairedEndHit>(parser); break; } hit_out.seekp(0, ios_base::beg); hit_out<<N[1]<<" "<<nHits<<" "<<read_type; hit_out.close(); //cntF for statistics of alignments file ofstream fout(cntF); fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl; fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl; fout<<nHits<<" "<<read_type<<endl; fout<<"0\t"<<N[0]<<endl; for (iter = counter.begin(); iter != counter.end(); iter++) { fout<<iter->first<<'\t'<<iter->second<<endl; } fout<<"Inf\t"<<N[2]<<endl; fout.close(); release(); if (verbose) { printf("Done!\n"); } return 0; }
int main(int argc, char* argv[]) { if (argc < 2) { printf("Usage: PROBer-build-reference refName [--gtf gtfF] [--mapping mappingF] [--allele-specific] [--files num_of_files file_1 file_2 ...] [--n2g-index] [-q]\n"); exit(-1); } hasGTF = false; mappingType = 0; n2g_idx = false; int argpos = 2; while (argpos < argc) { if (!strcmp(argv[argpos], "--gtf")) { hasGTF = true; strcpy(gtfF, argv[++argpos]); } if (!strcmp(argv[argpos], "--mapping")) { mappingType = 1; mappingPos = ++argpos; } if (!strcmp(argv[argpos], "--allele-specific")) mappingType = 2; if (!strcmp(argv[argpos], "--files")) { num_files = atoi(argv[++argpos]); file_pos = argpos + 1; // the position in argv for the first file argpos += num_files; } if (!strcmp(argv[argpos], "--n2g-index")) n2g_idx = true; if (!strcmp(argv[argpos], "-q")) verbose = false; ++argpos; } if (mappingType > 0) loadMappingInfo(mappingType, argv[mappingPos]); ifstream fin; string line, gseq, tseq; // gseq, genomic sequence; tseq, transcript sequence string seqname, gene_id, transcript_id; if (hasGTF) { transcripts.setType(0); assert(mappingType < 2); parse_gtf_file(gtfF); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); seqs.assign(M + 1, ""); chrvec.clear(); for (int i = 0; i < num_files; ++i, ++file_pos) { fin.open(argv[file_pos]); general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[file_pos]) + "! 
It may not exist."); getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } assert(gseq.length() > 0); sn2tr_iter = sn2tr.find(seqname); if (sn2tr_iter == sn2tr.end()) continue; chrvec.push_back(ChrInfo(seqname, gseq.length())); vector<int>& vec = sn2tr_iter->second; int s = vec.size(); for (int j = 0; j < s; ++j) { assert(vec[j] > 0 && vec[j] <= M); transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[file_pos]); } } sort(chrvec.begin(), chrvec.end()); // Shrink and build up Refs int curp = 0; for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (seqs[i] == "") printf("Warning: Cannot extract transcript %s because the chromosome it locates -- %s -- is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); else { refs.addRef(transcript.getTranscriptID(), seqs[i]); // insert RefSeqs ++curp; transcripts.move(i, curp); } } printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp); transcripts.setM(curp); M = transcripts.getM(); general_assert(M > 0, "The reference contains no transcripts!"); assert(refs.getM() == M); } else {
void writeToDisk(char* refName) { ofstream fout; sprintf(tiF, "%s.ti", refName); transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } sprintf(refFastaF, "%s.transcripts.fa", refName); refs.writeTo(refFastaF); sprintf(transListF, "%s.translist", refName); refs.writeTransListTo(transListF); sprintf(chromListF, "%s.chrlist", refName); fout.open(chromListF); for (int i = 0; i < (int)chrvec.size(); ++i) fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl; fout.close(); if (verbose) { printf("Chromosome List File is generated!\n"); } string cur_gene_id, cur_transcript_id, name; vector<int> gi, gt, ta; cur_gene_id = ""; gi.clear(); if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { gi.push_back(i); if (mappingType == 2) gt.push_back((int)ta.size()); cur_gene_id = transcript.getGeneID(); } if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) { ta.push_back(i); cur_transcript_id = transcript.getTranscriptID(); } } gi.push_back(M + 1); if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); } sprintf(groupF, "%s.grp", refName); fout.open(groupF); for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } if (mappingType == 2) { sprintf(gtF, "%s.gt", refName); fout.open(gtF); for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl; fout.close(); sprintf(taF, "%s.ta", refName); fout.open(taF); for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl; fout.close(); if (verbose) { printf("Allele-specific group files are generated!\n"); } } if (n2g_idx) { sprintf(n2g_idxF, "%s.n2g.idx.fa", refName); fout.open(n2g_idxF); for (int i = 1; i <= M; ++i) fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl; fout.close(); if 
(verbose) printf("%s is generated!\n", n2g_idxF); } }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, tid, gid; GTFItem item; general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; items.clear(); while (getline(fin, line)) { if (skip(line)) continue; item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else { item.parseAttributes(line); if (mappingType > 0) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep; --ep; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); sn2tr_iter = sn2tr.find(transcript.getSeqName()); if (sn2tr_iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { sn2tr_iter->second.push_back(sid); } sp = ep + 1; } items.clear(); if (verbose) { printf("Parsing GTF File is done!\n"); } }
int main(int argc, char* argv[]) { if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) { printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); if (hasMappingFile) { loadMappingInfo(argv[5]); } parse_gtf_file(argv[3]); ifstream fin; string line, gseq, seqname; chrvec.clear(); seqs.clear(); seqs.resize(M + 1, ""); int start = hasMappingFile ? 6 : 5; for (int i = start; i < argc; i++) { fin.open(argv[i]); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } size_t len = gseq.length(); assert(len > 0); for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); iter = sn2tr.find(seqname); if (iter == sn2tr.end()) continue; chrvec.push_back(ChrInfo(seqname, len)); vector<int>& vec = iter->second; int s = vec.size(); for (int j = 0; j < s; j++) { assert(vec[j] > 0 && vec[j] <= M); transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); } } fin.close(); if (verbose) { printf("%s is processed!\n", argv[i]); } } for (int i = 1; i <= M; i++) { if (seqs[i] == "") { const Transcript& transcript = transcripts.getTranscriptAt(i); fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s! Loading chromosome %s's sequence is failed. Please check if 1) the chromosome directory is set correctly; 2) the list of chromosome files is complete; 3) the FASTA files containing chromosome sequences are not truncated or having wrong format.\n", \ transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str(), transcript.getSeqName().c_str()); exit(-1); } } sort(chrvec.begin(), chrvec.end()); if (verbose) { printf("Extracting sequences is done!\n"); } writeResults(argv[1]); return 0; }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, curgid, tid, gid; // curgid: current gene id; GTFItem item; if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } int cnt = 0; items.clear(); while (getline(fin, line)) { if (line[0] == '#') continue; // if this line is comment, jump it item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); if (mi_iter == mi_table.end()) { fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); exit(-1); } //assert(iter != table.end()); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); starts.clear(); sn2tr.clear(); curgid = ""; int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); while (sp < nItems) { tid = items[sp].getTranscriptID(); gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; ep--; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); if (curgid != gid) { starts.push_back(sid); curgid = gid; } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { iter->second.push_back(sid); } sp = ep + 1; } M = transcripts.getM(); starts.push_back(M + 1); items.clear(); if (M < 1) { fprintf(stderr, 
"Number of transcripts in the reference is less than 1!\n"); exit(-1); } if (verbose) { printf("Parsing gtf File is done!\n"); } }
int main(int argc, char* argv[]) { ifstream fin; bool quiet = false; if (argc < 6) { printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling] [--seed seed] [--calc-evaluation-score nb_r nb_p L w]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); printf(" sampleName: sample's name, including the path\n"); printf(" sampleToken: sampleName excludes the path\n"); printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); printf(" --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n"); printf(" --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n"); printf(" --seed uint32: the seed used for the BAM sampling. 
(default: off)\n"); printf(" --calc-evaluation-score nb_r nb_p L w: " "nb_r and nb_p are parameters for the true transcript length distribution, which is modeled by a negative binomial distribution; " "L is the read length and w is the mininum overlap required for joining two contigs.\n"); printf("// model parameters should be in imdName.mparams.\n"); exit(-1); } time_t a = time(NULL); strcpy(refName, argv[1]); read_type = atoi(argv[2]); strcpy(outName, argv[3]); strcpy(imdName, argv[4]); strcpy(statName, argv[5]); nThreads = 1; genBamF = false; bamSampling = false; genGibbsOut = false; calcEvalScore = false; pt_fn_list = NULL; hasSeed = false; for (int i = 6; i < argc; i++) { if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); } if (!strcmp(argv[i], "-b")) { genBamF = true; inpSamType = argv[i + 1][0]; strcpy(inpSamF, argv[i + 2]); if (atoi(argv[i + 3]) == 1) { strcpy(fn_list, argv[i + 4]); pt_fn_list = (char*)(&fn_list); } } if (!strcmp(argv[i], "-q")) { quiet = true; } if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; } if (!strcmp(argv[i], "--sampling")) { bamSampling = true; } if (!strcmp(argv[i], "--seed")) { hasSeed = true; int len = strlen(argv[i + 1]); seed = 0; for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0'); } if (!strcmp(argv[i], "--calc-evaluation-score")) { calcEvalScore = true; nb_r = atof(argv[i + 1]); nb_p = atof(argv[i + 2]); L = atoi(argv[i + 3]); w = atoi(argv[i + 4]); } } general_assert(nThreads > 0, "Number of threads should be bigger than 0!"); verbose = !quiet; //basic info loading sprintf(refF, "%s.seq", refName); refs.loadRefs(refF); M = refs.getM(); sprintf(tiF, "%s.ti", refName); transcripts.readFrom(tiF); sprintf(cntF, "%s.cnt", statName); fin.open(cntF); general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! 
It may not exist."); fin>>N0>>N1>>N2>>N_tot; fin.close(); general_assert(N1 > 0, "There are no alignable reads!"); if ((READ_INT_TYPE)nThreads > N1) nThreads = N1; //set model parameters mparams.M = M; mparams.N[0] = N0; mparams.N[1] = N1; mparams.N[2] = N2; mparams.refs = &refs; sprintf(mparamsF, "%s.mparams", imdName); fin.open(mparamsF); general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist."); fin>> mparams.minL>> mparams.maxL>> mparams.probF; int val; // 0 or 1 , for estRSPD fin>>val; mparams.estRSPD = (val != 0); fin>> mparams.B>> mparams.mate_minL>> mparams.mate_maxL>> mparams.mean>> mparams.sd; fin>> mparams.seedLen; fin.close(); //run EM switch(read_type) { case 0 : EM<SingleRead, SingleHit, SingleModel>(); break; case 1 : EM<SingleReadQ, SingleHit, SingleQModel>(); break; case 2 : EM<PairedEndRead, PairedEndHit, PairedEndModel>(); break; case 3 : EM<PairedEndReadQ, PairedEndHit, PairedEndQModel>(); break; default : fprintf(stderr, "Unknown Read Type!\n"); exit(-1); } if (calcEvalScore) { CalcEvalScore ces(refs, nb_r, nb_p, L, w, statName); sprintf(scoreF, "%s.score", outName); ces.writeScoresTo(scoreF); char groupF[STRLEN]; GroupInfo gi; sprintf(groupF, "%s.grp", argv[1]); gi.load(groupF); ces.generateExpressionFiles(gi, transcripts, scoreF); } time_t b = time(NULL); printTimeUsed(a, b, "EM.cpp"); return 0; }
void writeResults(int option, char* refName) { ofstream fout, fout2; string cur_gene_id, cur_transcript_id, name; vector<int> gi, gt, ta; sprintf(tiF, "%s.ti", refName); transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } cur_gene_id = ""; gi.clear(); if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { gi.push_back(i); if (option == 2) gt.push_back((int)ta.size()); cur_gene_id = transcript.getGeneID(); } if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) { ta.push_back(i); cur_transcript_id = transcript.getTranscriptID(); } } gi.push_back(M + 1); if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); } sprintf(groupF, "%s.grp", refName); fout.open(groupF); for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } if (option == 2) { sprintf(gtF, "%s.gt", refName); fout.open(gtF); for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl; fout.close(); sprintf(taF, "%s.ta", refName); fout.open(taF); for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl; fout.close(); if (verbose) { printf("Allele-specific group files are generated!\n"); } } sprintf(refFastaF, "%s.transcripts.fa", refName); sprintf(chromListF, "%s.chrlist", refName); fout2.open(chromListF); fout.open(refFastaF); for (int i = 1; i <= M; i++) { name = transcripts.getTranscriptAt(i).getSeqName(); iter = name2seq.find(name); general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!"); fout<<">"<<name<<endl; fout<<iter->second<<endl; fout2<<name<<'\t'<<iter->second.length()<<endl; } fout.close(); fout2.close(); if (verbose) { printf("Chromosome List File is generated!\n"); printf("Extracted Sequences File is generated!\n"); } }
int main(int argc, char* argv[]) { if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) { printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); } // allele-specific if (hasMappingFile == 2) { transcripts.setType(2); } int start = hasMappingFile ? 5 : 4; ifstream fin; string line, gseq; string seqname, gene_id, transcript_id; vector<Interval> vec; M = 0; name2seq.clear(); for (int i = start; i < argc; i++) { fin.open(argv[i]); general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); unsigned long int line_no = 0; //Keep track of file line number getline(fin, line); line_no += 1; while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; while((getline(fin, line)) && (line[0] != '>')) { line_no += 1; gseq += line; } int len = gseq.length(); assert(len > 0); for (int j = 0; j < len; j++) gseq[j] = check(gseq[j],line_no); name2seq[seqname] = gseq; transcript_id = seqname; gene_id = seqname; if (hasMappingFile) { mi_iter = mi_table.find(seqname); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!"); gene_id = mi_iter->second; if (hasMappingFile == 2) { mi_iter2 = mi_table2.find(seqname); general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!"); transcript_id = mi_iter2->second; } } vec.clear(); vec.push_back(Interval(1, len)); transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, "")); ++M; if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); } } fin.close(); } if (M < 1) { fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); exit(-1); } assert(M == transcripts.getM()); transcripts.sort(); 
writeResults(hasMappingFile, argv[1]); return 0; }