void txCdsToGene(char *txBed, char *txFa, char *txCds, char *outGtf, char *outFa) /* txCdsToGene - Convert transcript bed and best cdsEvidence to genePred and * protein sequence. */ { struct hash *txSeqHash = faReadAllIntoHash(txFa, dnaLower); verbose(2, "Read %d transcript sequences from %s\n", txSeqHash->elCount, txFa); struct hash *cdsHash = cdsEvidenceReadAllIntoHash(txCds); verbose(2, "Read %d cdsEvidence from %s\n", cdsHash->elCount, txCds); struct lineFile *lf = lineFileOpen(txBed, TRUE); FILE *fGtf = mustOpen(outGtf, "w"); FILE *fFa = mustOpen(outFa, "w"); char *row[12]; while (lineFileRow(lf, row)) { struct bed *bed = bedLoad12(row); verbose(2, "processing %s\n", bed->name); struct cdsEvidence *cds = hashFindVal(cdsHash, bed->name); struct dnaSeq *txSeq = hashFindVal(txSeqHash, bed->name); char *cdsSource = NULL; if (txSeq == NULL) errAbort("%s is in %s but not %s", bed->name, txBed, txFa); if (cds != NULL) { outputProtein(cds, txSeq, fFa); if (cds->cdsCount > 1) { struct bed *newBed = breakUpBedAtCdsBreaks(cds, bed); if (fTweaked) fprintf(fTweaked, "%s\n", newBed->name); bedFree(&bed); bed = newBed; } cdsSource = cds->accession; if (sameString(cds->accession, ".")) cdsSource = cds->source; } /* Set bed CDS bounds and optionally output bed. */ cdsEvidenceSetBedThick(cds, bed); if (fBed) bedTabOutN(bed, 12, fBed); /* Parse out bed name, which is in format chrom.geneId.txId.accession */ char *geneName = cloneString(bed->name); char *accession = strrchr(geneName, '.'); assert(accession != NULL); *accession++ = 0; chopSuffix(geneName); /* Output as GTF */ bedToGtf(bed, accession, cdsSource, geneName, fGtf); /* Clean up for next iteration of loop. */ freez(&geneName); bedFree(&bed); } lineFileClose(&lf); carefulClose(&fFa); carefulClose(&fGtf); }
static void makeDirFasta(char *regionsFile, char *hg18FastaFile, char *dir, int num) { FILE *fp, *sq; char buf[500], dirName[500], seqName[500], chr1[500], chr2[500]; int b1, e1, b2, e2, i, len; char ori1, ori2; struct hash *seqHash = NULL; struct dnaSeq *seq1, *seq2; struct stat st; DNA *s1, *s2; seqHash = faReadAllIntoHash(hg18FastaFile, dnaUpper); if (stat(dir, &st) != 0) do_cmd("mkdir %s", dir); fp = mustOpen(regionsFile, "r"); i = 0; while (fgets(buf, 500, fp)) { if (sscanf(buf, "%[^:]:%d-%d %[^:]:%d-%d [%c %c]", chr1, &b1, &e1, chr2, &b2, &e2, &ori1, &ori2) != 8) errAbort("error: %s", buf); ++i; if (i != num) continue; sprintf(dirName, "%s/R%d", dir, i); if (stat(dirName, &st) != 0) do_cmd("mkdir %s", dir); sprintf(seqName, "%s/ref.fa", dirName); sq = mustOpen(seqName, "w"); fprintf(sq, ">%s:%d-%d+%s:%d-%d[%c%c]\n", chr1, b1, e1, chr2, b2, e2, ori1, ori2); seq1 = (struct dnaSeq *)hashFindVal(seqHash, chr1); assert(e1 <= seq1->size); len = e1 - b1 + 1; if (ori1 == '-') { s1 = cloneStringZExt(seq1->dna + b1 - 1, len, len+1); reverseComplement(s1, len); writeSeqWithBreaks(sq, s1, len, 80); freeMem(s1); } else writeSeqWithBreaks(sq, seq1->dna + b1 - 1, e1 - b1 + 1, 80); seq2 = (struct dnaSeq *)hashFindVal(seqHash, chr2); assert(e2 <= seq2->size); len = e2 - b2 + 1; if (ori2 == '-') { s2 = cloneStringZExt(seq2->dna + b2 - 1, len, len+1); reverseComplement(s2, len); writeSeqWithBreaks(sq, s2, len, 80); freeMem(s2); } else writeSeqWithBreaks(sq, seq2->dna + b2 - 1, e2 - b2 + 1, 80); fclose(sq); } fclose(fp); //FIXME: free space }
void txGeneAltProt(char *pepFile, char *isoformsFile, char *outFile) /* txGeneAltProt - Figure out statistics on number of alternative proteins produced by alt-splicing.. */ { struct hash *pepHash = faReadAllIntoHash(pepFile, dnaUpper); struct hash *totalUniqHash = hashNew(18); uglyf("Read %d from %s\n", pepHash->elCount, pepFile); int lastClusterId = -1; struct hash *uniqHash = NULL; struct slName *clusterList = NULL; FILE *f = mustOpen(outFile, "w"); struct lineFile *lf = lineFileOpen(isoformsFile, TRUE); char *row[2]; while (lineFileRow(lf, row)) { int clusterId = lineFileNeedNum(lf, row, 0); char *tx = row[1]; if (clusterId != lastClusterId) { if (uniqHash != NULL) { outputCluster(lastClusterId, clusterList, f); hashFree(&uniqHash); slFreeList(&clusterList); } uniqHash = hashNew(0); } lastClusterId = clusterId; struct dnaSeq *pep = hashFindVal(pepHash, tx); if (pep != NULL) { if (!hashLookup(uniqHash, pep->dna)) { hashAdd(uniqHash, pep->dna, NULL); slNameAddTail(&clusterList, tx); } if (!hashLookup(totalUniqHash, pep->dna)) hashAdd(totalUniqHash, pep->dna, NULL); } } outputCluster(lastClusterId, clusterList, f); verbose(1, "%d total unique proteins\n", totalUniqHash->elCount); carefulClose(&f); }
void txCdsOrfInfo(char *inCds, char *inFa, char *outInfo)
/* txCdsOrfInfo - Given a sequence and a putative ORF, calculate some basic
 * information on it.
 *
 * inCds rows are (sequence name, start, end).  Each row's sequence is looked
 * up in inFa and handed with its coordinates to outputOneRa, which receives
 * the outInfo handle.  Aborts when a named sequence is missing from inFa. */
{
    struct hash *dnaHash = faReadAllIntoHash(inFa, dnaLower);
    struct lineFile *lf = lineFileOpen(inCds, TRUE);
    FILE *f = mustOpen(outInfo, "w");
    char *row[3];

    while (lineFileRow(lf, row)) {
        char *seqName = row[0];
        int start = lineFileNeedNum(lf, row, 1);
        int end = lineFileNeedNum(lf, row, 2);
        struct dnaSeq *seq = hashFindVal(dnaHash, seqName);

        if (seq == NULL)
            errAbort("%s is in %s but not %s", seqName, inCds, inFa);
        outputOneRa(seq, start, end, f);
    }

    /* The input was previously left open. */
    lineFileClose(&lf);
    carefulClose(&f);
}
void txCdsEvFromBorf(char *inBorf, char *txFa, char *outTce) /* txCdsEvFromBorf - Convert borfBig format to txCdsEvidence (tce) in an effort * to annotate the coding regions.. */ { struct lineFile *lf = lineFileOpen(inBorf, TRUE); struct hash *txHash = faReadAllIntoHash(txFa, dnaLower); char *row[BORF_NUM_COLS]; FILE *f = mustOpen(outTce, "w"); while (lineFileRowTab(lf, row)) { struct borf b; borfStaticLoad(row, &b); if (b.strand[0] == '+' && b.score >= 50) { struct dnaSeq *txSeq = hashFindVal(txHash, b.name); boolean hasStop = FALSE; if (b.cdsEnd + 3 < txSeq->size) { hasStop = isStopCodon(txSeq->dna + b.cdsEnd); b.cdsEnd += 3; } if (txSeq == NULL) errAbort("%s is in %s but not %s", b.name, inBorf, txFa); int score = (b.score - 45)*5; if (score > 1000) score = 1000; if (score < 0) score = 0; fprintf(f, "%s\t", b.name); fprintf(f, "%d\t", b.cdsStart); fprintf(f, "%d\t", b.cdsEnd); fprintf(f, "%s\t", "bestorf"); fprintf(f, "%s\t", "."); fprintf(f, "%d\t", score); fprintf(f, "%d\t", startsWith("atg", txSeq->dna + b.cdsStart)); fprintf(f, "%d\t", hasStop); fprintf(f, "%d\t", 1); fprintf(f, "%d,\t", b.cdsStart); fprintf(f, "%d,\n", b.cdsEnd - b.cdsStart); } } lineFileClose(&lf); carefulClose(&f); }
void txGeneFromBed(char *inBed, char *inPicks, char *ucscFa, char *uniProtFa, char *refPepFa, char *outKg) /* txGeneFromBed - Convert from bed to knownGenes format table (genePred + uniProt ID). */ { /* Load protein sequence into hashes */ struct hash *uniProtHash = faReadAllIntoHash(uniProtFa, dnaUpper); struct hash *ucscProtHash = faReadAllIntoHash(ucscFa, dnaUpper); struct hash *refProtHash =faReadAllIntoHash(refPepFa, dnaUpper); /* Load picks into hash. We don't use cdsPicksLoadAll because empty fields * cause that autoSql-generated routine problems. */ struct hash *pickHash = newHash(18); struct cdsPick *pick; struct lineFile *lf = lineFileOpen(inPicks, TRUE); char *row[CDSPICK_NUM_COLS]; while (lineFileRowTab(lf, row)) { pick = cdsPickLoad(row); hashAdd(pickHash, pick->name, pick); } /* Load in bed */ struct bed *bed, *bedList = bedLoadNAll(inBed, 12); /* Do reformatting and write output. */ FILE *f = mustOpen(outKg, "w"); for (bed = bedList; bed != NULL; bed = bed->next) { char *protAcc = NULL; if (bed->thickStart < bed->thickEnd) { pick = hashMustFindVal(pickHash, bed->name); struct dnaSeq *spSeq = NULL, *uniSeq = NULL, *refPep = NULL, *ucscSeq; ucscSeq = hashMustFindVal(ucscProtHash, bed->name); if (pick->swissProt[0]) spSeq = hashMustFindVal(uniProtHash, pick->swissProt); if (pick->uniProt[0]) uniSeq = hashMustFindVal(uniProtHash, pick->uniProt); if (pick->refProt[0]) refPep = hashMustFindVal(refProtHash, pick->refProt); /* First we look for an exact match between the ucsc protein and * something from swissProt/uniProt. 
*/ if (spSeq != NULL && sameString(ucscSeq->dna, spSeq->dna)) protAcc = pick->swissProt; if (protAcc == NULL && uniSeq != NULL && sameString(ucscSeq->dna, uniSeq->dna)) protAcc = pick->uniProt; if (protAcc == NULL && refPep != NULL && sameString(ucscSeq->dna, refPep->dna)) { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } if (protAcc == NULL) { if (pick->uniProt[0]) protAcc = pick->uniProt; else { protAcc = cloneString(pick->refProt); chopSuffix(protAcc); } } } outputKg(bed, emptyForNull(protAcc), f); } carefulClose(&f); }