void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, tid, gid; GTFItem item; general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist."); int cnt = 0; items.clear(); while (getline(fin, line)) { if (skip(line)) continue; item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { printf("Warning: exon's start position is less than 1! This exon is discarded.\n"); printf("\t%s\n\n", line.c_str()); } else { item.parseAttributes(line); if (mappingType > 0) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!"); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); sn2tr.clear(); while (sp < nItems) { tid = items[sp].getTranscriptID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ++ep; --ep; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); sn2tr_iter = sn2tr.find(transcript.getSeqName()); if (sn2tr_iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { sn2tr_iter->second.push_back(sid); } sp = ep + 1; } items.clear(); if (verbose) { printf("Parsing GTF File is done!\n"); } }
void parse_gtf_file(char* gtfF) { ifstream fin(gtfF); string line, curgid, tid, gid; // curgid: current gene id; GTFItem item; if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); } int cnt = 0; items.clear(); while (getline(fin, line)) { if (line[0] == '#') continue; // if this line is comment, jump it item.parse(line); string feature = item.getFeature(); if (feature == "exon") { if (item.getStart() > item.getEnd()) { fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else if (item.getStart() < 1) { fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n"); fprintf(stderr, "\t%s\n\n", line.c_str()); } else { if (hasMappingFile) { tid = item.getTranscriptID(); mi_iter = mi_table.find(tid); if (mi_iter == mi_table.end()) { fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); exit(-1); } //assert(iter != table.end()); gid = mi_iter->second; item.setGeneID(gid); } items.push_back(item); } } ++cnt; if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); } } fin.close(); sort(items.begin(), items.end()); starts.clear(); sn2tr.clear(); curgid = ""; int sp = 0, ep; // start pointer, end pointer int nItems = items.size(); while (sp < nItems) { tid = items[sp].getTranscriptID(); gid = items[sp].getGeneID(); ep = sp + 1; while (ep < nItems && items[ep].getTranscriptID() == tid) ep++; ep--; buildTranscript(sp, ep); int sid = transcripts.getM(); const Transcript& transcript = transcripts.getTranscriptAt(sid); if (curgid != gid) { starts.push_back(sid); curgid = gid; } iter = sn2tr.find(transcript.getSeqName()); if (iter == sn2tr.end()) { vector<int> vec(1, sid); sn2tr[transcript.getSeqName()] = vec; } else { iter->second.push_back(sid); } sp = ep + 1; } M = transcripts.getM(); starts.push_back(M + 1); items.clear(); if (M < 1) { fprintf(stderr, "Number of transcripts in the reference is less than 1!\n"); exit(-1); } if (verbose) { printf("Parsing gtf File is done!\n"); } }