/*
 * Update the status line after a generic failure.
 *
 * When the current status line is empty, contains "..." or contains
 * "generating" (matched with strifind), the status is replaced with
 * "Failed!". Any other status line leads to mix_status(NULL).
 */
void mix_genericerror(void)
{
  /* A status line with real content that matches none of the markers. */
  if (!streq(statusline, "")
      && !strfind(statusline, "...")
      && !strifind(statusline, "generating")) {
    mix_status(NULL);
    return;
  }
  mix_status("Failed!");
}
/*
 * Translate a textual remailer chain into an array of remailer indices.
 *
 *   hop[]     output: the chain as integers, stored backwards (the final
 *             hop is hop[0]); 0 means "random", like '*'
 *   chainstr  input: the desired chain such as "*,*,*,*"; when NULL or
 *             empty, CHAIN is used instead
 *   maxrem    number of entries of remailer[] that are searched
 *   remailer  input list of remailer details (see mix2_rlist())
 *   type      0 selects entries with flags.mix, 1 flags.cpunk,
 *             2 flags.newnym; any other value matches nothing
 *   feedback  optional buffer receiving error messages (may be NULL)
 *
 * Returns the number of hops stored, or -1 when an element of the chain
 * does not name a usable remailer.
 */
int chain_select(int hop[], char *chainstr, int maxrem, REMAILER *remailer,
		 int type, BUFFER *feedback)
{
  int count = 0;
  int pos, start, idx;
  int type_ok;
  BUFFER *spec, *token, *parsed;

  spec = buf_new();
  token = buf_new();
  parsed = buf_new();

  if (chainstr != NULL && chainstr[0] != '\0')
    buf_sets(spec, chainstr);
  else
    buf_sets(spec, CHAIN);

  /* Walk the string from its end so that the last element comes first. */
  for (pos = spec->length; pos >= 0; pos--) {
    /* An element starts at offset 0 or right after ',', ';' or ':'. */
    if (pos > 0) {
      switch (spec->data[pos - 1]) {
      case ',':
      case ';':
      case ':':
	break;
      default:
	continue;
      }
    }

    /* ignore leading whitespace of the element */
    start = pos;
    while (isspace(spec->data[start]))
      start++;
    if (spec->data[start] == '\0')
      break;

    if (spec->data[start] == '*') {
      idx = 0;
    } else {
      /* (numeric hop specifications are disabled in this build) */
      buf_sets(token, spec->data + start);
      rfc822_addr(token, parsed);
      buf_clear(token);
      buf_getline(parsed, token);
      if (!token->length)	/* nothing parsed: use the raw text */
	buf_sets(token, spec->data + start);

      for (idx = 0; idx < maxrem; idx++) {
	switch (type) {
	case 0:
	  type_ok = remailer[idx].flags.mix ? 1 : 0;
	  break;
	case 1:
	  type_ok = remailer[idx].flags.cpunk ? 1 : 0;
	  break;
	case 2:
	  type_ok = remailer[idx].flags.newnym ? 1 : 0;
	  break;
	default:
	  type_ok = 0;
	  break;
	}
	if (!type_ok)
	  continue;
	if (streq(remailer[idx].name, token->data)
	    || strieq(remailer[idx].addr, token->data)
	    || (token->data[0] == '@'
		&& strifind(remailer[idx].addr, token->data)))
	  break;
      }
    }

    if (idx < 0 || idx >= maxrem) {
      if (feedback != NULL) {
	buf_appendf(feedback, "No such remailer: %b", token);
	buf_nl(feedback);
      }
      count = -1;
      break;
    }

    hop[count++] = idx;
    if (count >= 20) {		/* array passed in has length 20 */
      if (feedback != NULL)
	buf_appends(feedback, "Chain too long.\n");
      break;
    }

    /* cut the processed element off so the next one ends here */
    if (pos > 0)
      spec->data[pos - 1] = '\0';
  }

  buf_free(spec);
  buf_free(token);
  buf_free(parsed);
  return count;
}
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan, bool forceExons) { GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted gffr->showWarnings(this->showWarnings); // keepAttrs mergeCloseExons noExonAttr gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs); GVec<int> pseudoAttrIds; GVec<int> pseudoFeatureIds; if (this->noPseudo) { GffNameList& fnames = gffr->names->feats; for (int i=0;i<fnames.Count();i++) { char* n=fnames[i]->name; if (startsWith(n, "pseudo")) { pseudoFeatureIds.Add(fnames[i]->idx); } } GffNameList& attrnames = gffr->names->attrs; for (int i=0;i<attrnames.Count();i++) { char* n=attrnames[i]->name; char* p=strifind(n, "pseudo"); if (p==n || (p==n+2 && tolower(n[0])=='i' && tolower(n[1])=='s')) { pseudoAttrIds.Add(attrnames[i]->idx); } } } //int redundant=0; //redundant annotation discarded if (verbose) GMessage(" .. 
loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars()); //int rna_deleted=0; //add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates for (int k=0;k<gffr->gflst.Count();k++) { GffObj* m=gffr->gflst[k]; if (strcmp(m->getFeatureName(), "locus")==0 && m->getAttr("transcripts")!=NULL) { continue; //discard locus meta-features } if (this->noPseudo) { bool is_pseudo=false; for (int i=0;i<pseudoFeatureIds.Count();++i) { if (pseudoFeatureIds[i]==m->ftype_id) { is_pseudo=true; break; } } if (is_pseudo) continue; for (int i=0;i<pseudoAttrIds.Count();++i) { char* attrv=NULL; if (m->attrs!=NULL) attrv=m->attrs->getAttr(pseudoAttrIds[i]); if (attrv!=NULL) { char fc=tolower(attrv[0]); if (fc=='t' || fc=='y' || fc=='1') { is_pseudo=true; break; } } } if (is_pseudo) continue; //last resort: // scan all the attribute values for "pseudogene" keyword (NCBI does that for "product" attr) /* if (m->attrs!=NULL) { for (int i=0;i<m->attrs->Count();++i) { GffAttr& a=*(m->attrs->Get(i)); if (strifind(a.attr_val, "pseudogene")) { is_pseudo=true; break; } } } if (is_pseudo) continue; */ } //pseudogene detection requested char* rloc=m->getAttr("locus"); if (rloc!=NULL && startsWith(rloc, "RLOC_")) { m->removeAttr("locus", rloc); } /* if (m->exons.Count()==0 && m->children.Count()==0) { //a non-mRNA feature with no subfeatures //add a dummy exon just to have the generic exon checking work m->addExon(m->start,m->end); } */ if (forceExons) { // && m->children.Count()==0) { m->exon_ftype_id=gff_fid_exon; } //GList<GffObj> gfadd(false,false); -- for gf_validate()? 
if (gf_validate!=NULL && !(*gf_validate)(m, NULL)) { continue; } m->isUsed(true); //so the gffreader won't destroy it int i=-1; GenomicSeqData f(m->gseq_id); GenomicSeqData* gdata=NULL; if (seqdata.Found(&f,i)) gdata=seqdata[i]; else { //entry not created yet for this genomic seq gdata=new GenomicSeqData(m->gseq_id); seqdata.Add(gdata); } /* for (int k=0;k<gfadd.Count();k++) { bool keep=placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan); if (!keep) { gfadd[k]->isUsed(false); //DEBUG GMessage("Feature %s(%d-%d) is going to be discarded..\n",gfadd[k]->getID(), gfadd[k]->start, gfadd[k]->end); } } */ bool keep=placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan); if (!keep) { m->isUsed(false); //DEBUG //GMessage("Feature %s(%d-%d) is going to be discarded..\n",m->getID(), m->start, m->end); } } //for each read gffObj //if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars()); if (f!=stdin) { fclose(f); f=NULL; } delete gffr; }
bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) { //returns true if the transcript passed the filter char* gname=gffrec.getGeneName(); if (gname==NULL) gname=gffrec.getGeneID(); GStr defline(gffrec.getID()); if (f_out && !fmtGTF) { const char* tname=NULL; if ((tname=gffrec.getAttr("transcript_name"))!=NULL) { gffrec.addAttr("Name", tname); gffrec.removeAttr("transcript_name"); } } if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) { const char* biotype=gffrec.getAttr("gene_biotype"); if (biotype) { gffrec.addAttr("type", biotype); gffrec.removeAttr("gene_biotype"); } else { //old Ensembl files lacking gene_biotype gffrec.addAttr("type", gffrec.getTrackName()); } //bool is_gene=false; bool is_pseudo=false; if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS()) gffrec.setFeatureName("mRNA"); else { if (strcmp(biotype, "processed_transcript")==0) gffrec.setFeatureName("proc_RNA"); else { //is_gene=endsWith(biotype, "gene"); is_pseudo=strifind(biotype, "pseudo"); if (is_pseudo) { gffrec.setFeatureName("pseudo_RNA"); } else if (endsWith(biotype, "RNA")) { gffrec.setFeatureName(biotype); } else gffrec.setFeatureName("misc_RNA"); } } } if (gname && strcmp(gname, gffrec.getID())!=0) { int* isonum=isoCounter.Find(gname); if (isonum==NULL) { isonum=new int(1); isoCounter.Add(gname,isonum); } else (*isonum)++; defline.appendfmt(" gene=%s", gname); } int seqlen=0; const char* tlabel=tracklabel; if (tlabel==NULL) tlabel=gffrec.getTrackName(); //defline.appendfmt(" track:%s",tlabel); char* cdsnt = NULL; char* cdsaa = NULL; int aalen=0; for (int i=1;i<gffrec.exons.Count();i++) { int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1; if (ilen>4000000) GMessage("Warning: very large intron (%d) for transcript %s\n", ilen, gffrec.getID()); if (ilen>maxintron) { return false; } } GList<GSeg> seglst(false,true); GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec); if (spliceCheck && gffrec.exons.Count()>1) { //check introns for splice site consensi ( GT-AG, GC-AG or 
AT-AC ) if (faseq==NULL) GError("Error: no genomic sequence available!\n"); int glen=gffrec.end-gffrec.start+1; const char* gseq=faseq->subseq(gffrec.start, glen); bool revcompl=(gffrec.strand=='-'); bool ssValid=true; for (int e=1;e<gffrec.exons.Count();e++) { const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start; int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1; GSpliceSite acceptorSite(intron,intronlen,true, revcompl); GSpliceSite donorSite(intron,intronlen, false, revcompl); //GMessage("%c intron %d-%d : %s .. %s\n", // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt); if (acceptorSite=="AG") { // GT-AG or GC-AG if (!donorSite.canonicalDonor()) { ssValid=false;break; } } else if (acceptorSite=="AC") { // if (donorSite!="AT") { ssValid=false; break; } } else { ssValid=false; break; } } //GFREE(gseq); if (!ssValid) { if (verbose) GMessage("Invalid splice sites found for '%s'\n",gffrec.getID()); return false; //don't print this one! } } bool trprint=true; int stopCodonAdjust=0; int mCDphase=0; bool hasStop=false; if (gffrec.CDphase=='1' || gffrec.CDphase=='2') mCDphase = gffrec.CDphase-'0'; if (f_y!=NULL || f_x!=NULL || validCDSonly) { if (faseq==NULL) GError("Error: no genomic sequence provided!\n"); //if (protmap && fullCDSonly) { //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) { if (validCDSonly) { //make sure the stop codon is always included //adjust_stopcodon(gffrec,3); stopCodonAdjust=adjust_stopcodon(gffrec,3); } int strandNum=0; int phaseNum=0; CDS_CHECK: cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst); if (cdsnt==NULL) trprint=false; else { //has CDS if (validCDSonly) { cdsaa=translateDNA(cdsnt, aalen, seqlen); char* p=strchr(cdsaa,'.'); hasStop=false; if (p!=NULL) { if (p-cdsaa>=aalen-2) { //stop found as the last codon *p='0';//remove it hasStop=true; if (aalen-2==p-cdsaa) { //previous to last codon is the stop codon //so correct the CDS stop accordingly 
adjust_stopcodon(gffrec,-3, &seglst); stopCodonAdjust=0; //clear artificial stop adjustment seqlen-=3; cdsnt[seqlen]=0; } aalen=p-cdsaa; } else {//stop found before the last codon trprint=false; } }//stop codon found if (trprint==false) { //failed CDS validity check //in-frame stop codon found if (altPhases && phaseNum<3) { phaseNum++; gffrec.CDphase = '0'+((mCDphase+phaseNum)%3); GFREE(cdsaa); goto CDS_CHECK; } if (gffrec.exons.Count()==1 && bothStrands) { strandNum++; phaseNum=0; if (strandNum<2) { GFREE(cdsaa); gffrec.strand = (gffrec.strand=='-') ? '+':'-'; goto CDS_CHECK; //repeat the CDS check for a different frame } } if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID()); } //has in-frame STOP if (fullCDSonly) { if (!hasStop || cdsaa[0]!='M') trprint=false; } } // CDS check requested } //has CDS } //translation or codon check/output was requested if (!trprint) { GFREE(cdsnt); GFREE(cdsaa); return false; } if (stopCodonAdjust>0 && !hasStop) { //restore stop codon location adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst); if (cdsnt!=NULL && seqlen>0) { seqlen-=stopCodonAdjust; cdsnt[seqlen]=0; } if (cdsaa!=NULL) aalen--; } if (f_y!=NULL) { //CDS translation fasta output requested //char* if (cdsaa==NULL) { //translate now if not done before cdsaa=translateDNA(cdsnt, aalen, seqlen); } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_y, defline, cdsaa, aalen); } if (f_x!=NULL) { //CDS only if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline.appendfmt("(%c)",gffrec.strand); //warning: not CDS coordinates are written here, but the exon ones defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; // -- here these are CDS substring coordinates on the spliced sequence: 
defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcript for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_x, defline, cdsnt, seqlen); } GFREE(cdsnt); GFREE(cdsaa); if (f_w!=NULL) { //write spliced exons uint cds_start=0; uint cds_end=0; seglst.Clear(); char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst); if (exont!=NULL) { if (gffrec.CDstart>0) { defline.appendfmt(" CDS=%d-%d", cds_start, cds_end); } if (writeExonSegs) { defline.append(" loc:"); defline.append(gffrec.getGSeqName()); defline+=(char)'|'; defline+=(int)gffrec.start; defline+=(char)'-'; defline+=(int)gffrec.end; defline+=(char)'|'; defline+=(char)gffrec.strand; defline.append(" exons:"); for (int i=0;i<gffrec.exons.Count();i++) { if (i>0) defline.append(","); defline+=(int)gffrec.exons[i]->start; defline.append("-"); defline+=(int)gffrec.exons[i]->end; } defline.append(" segs:"); for (int i=0;i<seglst.Count();i++) { if (i>0) defline.append(","); defline+=(int)seglst[i]->start; defline.append("-"); defline+=(int)seglst[i]->end; } } if (fullattr && gffrec.attrs!=NULL) { //append all attributes found for each transcripts for (int i=0;i<gffrec.attrs->Count();i++) { defline.append(" "); defline.append(gffrec.getAttrName(i)); defline.append("="); defline.append(gffrec.getAttrValue(i)); } } printFasta(f_w, defline, exont, seqlen); GFREE(exont); } } //writing f_w (spliced exons) return true; }
/*
 * Read the type-I remailer list (TYPE1LIST) into remailer[].
 *
 * First part of the file: "$remailer{"name"} = "<addr> flags...";" lines.
 * Each one adds an entry (or updates the entry with the same name) and
 * sets its capability flags from the text following '>'. Entries are
 * stored starting at index 1. This part ends at a line of dashes.
 *
 * Second part: statistics lines of 72 or 73 characters beginning with a
 * known remailer name followed by a space; history, latency and
 * reliability are taken from fixed columns of such a line.
 *
 * The STAREX file, if present, is searched for each name to set
 * flags.star_ex. Broken chains are parsed into badchains by
 * parse_badchains().
 *
 * Returns one more than the number of entries read, or -1 when the list
 * file cannot be opened.
 *
 * NOTE(review): the capacity of remailer[i].addr and of the remailer
 * array itself is not visible here; the address copy and the index i
 * (which can reach MAXREM) rely on the caller's definitions -- verify.
 */
int t1_rlist(REMAILER remailer[], int badchains[MAXREM][MAXREM])
{
  FILE *list, *excl;
  int i, listed = 0;
  int n = 0;
  char line[2 * LINELEN], l2[LINELEN], name[LINELEN], *flags;
  char *lt, *gt, *lbrace, *rbrace;
  long namelen, addrlen;
  BUFFER *starex;

  starex = buf_new();
  excl = mix_openfile(STAREX, "r");
  if (excl != NULL) {
    buf_read(starex, excl);
    fclose(excl);
  }
  list = mix_openfile(TYPE1LIST, "r");
  if (list == NULL) {
    buf_free(starex);
    return (-1);
  }

  while (fgets(line, sizeof(line), list) != NULL && n < MAXREM) {
    if (strleft(line, "$remailer") &&
	strchr(line, '<') && strchr(line, '>') &&
	strchr(line, '{') && strchr(line, '{') + 4 < strchr(line, '}')) {
      /* strip the line end and trailing blanks; the line starts with
       * "$remailer", so it cannot become empty here */
      if (line[strlen(line) - 1] == '\n')
	line[strlen(line) - 1] = '\0';
      if (line[strlen(line) - 1] == '\r')
	line[strlen(line) - 1] = '\0';
      while (line[strlen(line) - 1] == ' ')
	line[strlen(line) - 1] = '\0';
      /* a definition not ending in ';' continues on the next line */
      if (line[strlen(line) - 1] != ';'
	  && fgets(l2, sizeof(l2), list) != NULL)
	strcatn(line, l2, LINELEN);

      /* Locate the delimiters again now that the line is complete and
       * skip the line unless "{..}" and "<..>" are well-formed, so that
       * no copy length below can be negative. */
      lbrace = strchr(line, '{');
      rbrace = strchr(line, '}');
      lt = strchr(line, '<');
      gt = strchr(line, '>');
      if (lbrace == NULL || rbrace == NULL || lbrace + 4 >= rbrace
	  || lt == NULL || gt == NULL || gt < lt)
	continue;

      flags = gt;

      /* The name sits between {" and "}. Names are cut at 20 characters,
       * so never copy more than that into the local buffer. */
      namelen = (long) (rbrace - lbrace) - 3;
      if (namelen > 20)
	namelen = 20;
      strncpy(name, lbrace + 2, (size_t) namelen);
      name[namelen] = '\0';

      for (i = 1; i <= n; i++)
	if (streq(name, remailer[i].name))
	  break;
      if (i > n) {
	/* not in mix list */
	n++;
	strcpy(remailer[i].name, name);
	addrlen = (long) (gt - lt) - 1;	/* text between '<' and '>' */
	strncpy(remailer[i].addr, lt + 1, (size_t) addrlen);
	remailer[i].addr[addrlen] = '\0';
	remailer[i].flags.mix = 0;
	remailer[i].flags.post = strifind(flags, " post");
      }
      remailer[i].flags.cpunk = strfind(flags, " cpunk");
      remailer[i].flags.pgp = strfind(flags, " pgp");
      remailer[i].flags.pgponly = strfind(flags, " pgponly");
      remailer[i].flags.latent = strfind(flags, " latent");
      remailer[i].flags.middle = strfind(flags, " middle");
      remailer[i].flags.ek = strfind(flags, " ek");
      remailer[i].flags.esub = strfind(flags, " esub");
      remailer[i].flags.hsub = strfind(flags, " hsub");
      remailer[i].flags.newnym = strfind(flags, " newnym");
      remailer[i].flags.nym = strfind(flags, " nym");
      remailer[i].info[1].reliability = 0;
      remailer[i].info[1].latency = 0;
      remailer[i].info[1].history[0] = '\0';
      remailer[i].flags.star_ex = bufifind(starex, name);
    }
    if (strleft(line,
		"-----------------------------------------------------------------------"))
      break;
  }
  n++;				/* ?? */

  /* statistics section: fixed-column lines, one per remailer */
  while (fgets(line, sizeof(line), list) != NULL) {
    if (strlen(line) >= 72 && strlen(line) <= 73)
      for (i = 1; i < n; i++)
	if (strleft(line, remailer[i].name) &&
	    line[strlen(remailer[i].name)] == ' ') {
	  strncpy(remailer[i].info[1].history, line + 42, 12);
	  remailer[i].info[1].history[12] = '\0';
	  remailer[i].info[1].reliability = 10000 * N(line[64])
	    + 1000 * N(line[65]) + 100 * N(line[66])
	    + 10 * N(line[68]) + N(line[69]);
	  remailer[i].info[1].latency = 36000 * N(line[55])
	    + 3600 * N(line[56]) + 600 * N(line[58])
	    + 60 * N(line[59]) + 10 * N(line[61])
	    + N(line[62]);
	  listed++;
	}
  }
  fclose(list);
  parse_badchains(badchains, TYPE1LIST, "Broken type-I remailer chains", remailer, n);
  if (listed < 4)		/* we have no valid reliability info */
    for (i = 1; i < n; i++)
      remailer[i].info[1].reliability = 10000;

#ifdef USE_PGP
  pgp_rlist(remailer, n);
#endif /* USE_PGP */
  buf_free(starex);
  return (n);
}