void faLowerToN(char *inName, char *outName)
/* faLowerToN - Convert lower case bases to N.
 * Reads inName one line at a time and writes every line, followed by a
 * newline, to outName.  Lines whose first character is '>' are written
 * untouched; every other line is passed through lowerToN() first.
 * Both the input reader and the output file are closed before returning. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
char *line;

while (lineFileNext(lf, &line, NULL))
    {
    /* Leave '>' header lines as they are; only other lines get masked. */
    if (line[0] != '>')
        lowerToN(line);
    fprintf(f, "%s\n", line);
    }
carefulClose(&f);
/* Release the input reader too, so its handle and buffer do not leak. */
lineFileClose(&lf);
}
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa) /* seqFromPsl - Extract masked sequence from database corresponding to psl file. */ { struct twoBitFile *tbf = twoBitOpen(inTwoBit); struct lineFile *lf = pslFileOpen(inPsl); FILE *f = mustOpen(outFa, "w"); struct psl *psl; while ((psl = pslNext(lf)) != NULL) { char faHead[512]; struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName, psl->tStart, psl->tEnd); if (psl->strand[0] == '-') reverseComplement(seq->dna, seq->size); safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", psl->qName, psl->tName, psl->tStart+1, psl->tEnd); if (hardMask) lowerToN(seq->dna, seq->size); faWriteNext(f, faHead, seq->dna, seq->size); } carefulClose(&f); }
static void hgSeqConcatRegionsDb(char *db, char *chrom, int chromSize, char strand, char *name,
                                 int rCount, unsigned *rStarts, unsigned *rSizes,
                                 boolean *exonFlags, boolean *cdsFlags)
/* Concatenate and print out dna for a series of regions.
 *   db, chrom  - passed to hDnaFromSeq() to fetch the sequence.
 *   chromSize  - if > 0, the fetched window is not allowed to extend past it.
 *   strand     - '-' requests reverse complement (as does hgSeq.revComp).
 *   name       - used with db to build the record name.
 *   rCount     - number of entries in rStarts/rSizes/exonFlags/cdsFlags;
 *                nothing is done when it is < 1.
 *   rStarts, rSizes - start and size of each region; the window is taken
 *                from rStarts[0] to the end of the last region, so the
 *                regions are presumably in ascending order (verify against
 *                callers).
 *   exonFlags, cdsFlags - per-region flags; a region is upper-cased when
 *                hgSeq.casing is "exon" / "cds" and its flag is set.
 * Options are read from the CGI variables hgSeq.revComp, hgSeq.maskRepeats,
 * hgSeq.padding5, hgSeq.padding3, hgSeq.casing, hgSeq.repMasking and
 * hgSeq.granularity.  The result is written to stdout as one fasta record,
 * or as a "# Null range" comment line when the window is empty. */
{
// Note: this code used to generate different sequence ids if the global
// database in hdb was different than the db parameter.  This functionality
// has been removed since the global database was removed and it didn't
// appear to be used.

struct dnaSeq *rSeq = NULL;
struct dnaSeq *cSeq = NULL;
char recName[256];
int seqStart, seqEnd;
int offset, cSize;
int fetchedSize;
int i;
boolean isRc     = (strand == '-') || cgiBoolean("hgSeq.revComp");
boolean maskRep  = cgiBoolean("hgSeq.maskRepeats");
int padding5     = cgiOptionalInt("hgSeq.padding5", 0);
int padding3     = cgiOptionalInt("hgSeq.padding3", 0);
char *casing     = cgiString("hgSeq.casing");
char *repMasking = cgiString("hgSeq.repMasking");
char *granularity  = cgiOptionalString("hgSeq.granularity");
boolean concatRegions = granularity && sameString("gene", granularity);

if (rCount < 1)
    return;

/* Don't support padding if granularity is gene (i.e. concat'ing all). */
if (concatRegions)
    {
    padding5 = padding3 = 0;
    }

/* On the reverse strand the 3' padding sits before the first region and
 * the 5' padding after the last one, hence the swapped roles below. */
i = rCount - 1;
seqStart = rStarts[0]             - (isRc ? padding3 : padding5);
seqEnd   = rStarts[i] + rSizes[i] + (isRc ? padding5 : padding3);
/* Padding might push us off the edge of the chrom; if so, truncate: */
if (seqStart < 0)
    {
    if (isRc)
        padding3 += seqStart;
    else
        padding5 += seqStart;
    seqStart = 0;
    }

/* if we know the chromSize, don't pad out beyond it */
if ((chromSize > 0) && (seqEnd > chromSize))
    {
    if (isRc)
        padding5 += (chromSize - seqEnd);
    else
        padding3 += (chromSize - seqEnd);
    seqEnd = chromSize;
    }
if (seqEnd <= seqStart)
    {
    printf("# Null range for %s_%s (range=%s:%d-%d 5'pad=%d 3'pad=%d) (may indicate a query-side insert)\n",
           db, name, chrom, seqStart+1, seqEnd, padding5, padding3);
    return;
    }
if (maskRep)
    {
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaMixed);
    if (sameString(repMasking, "N"))
        lowerToN(rSeq->dna, strlen(rSeq->dna));
    if (!sameString(casing, "upper"))
        tolowers(rSeq->dna);
    }
else if (sameString(casing, "upper"))
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaUpper);
else
    rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaLower);

/* Handle casing and compute size of concatenated sequence */
fetchedSize = seqEnd - seqStart;   /* length of the window requested above */
cSize = 0;
for (i=0;  i < rCount;  i++)
    {
    if ((sameString(casing, "exon") && exonFlags[i]) ||
        (sameString(casing, "cds") && cdsFlags[i]))
        {
        int rStart = rStarts[i] - seqStart;
        int rEnd = rStart + (int)rSizes[i];
        /* A negative padding value (from CGI input, or produced by the
         * chromSize truncation above) can leave part of a region outside
         * the fetched window.  Clamp the span so toUpperN() never writes
         * before or past the window. */
        if (rStart < 0)
            rStart = 0;
        if (rEnd > fetchedSize)
            rEnd = fetchedSize;
        if (rEnd > rStart)
            toUpperN(rSeq->dna+rStart, rEnd - rStart);
        }
    cSize += rSizes[i];
    }
cSize += (padding5 + padding3);
AllocVar(cSeq);
cSeq->dna = needLargeMem(cSize+1);
cSeq->size = cSize;

/* Copy the regions back to back; the first and last copies are widened
 * to take in the leading and trailing padding. */
offset = 0;
for (i=0;  i < rCount;  i++)
    {
    int start = rStarts[i] - seqStart;
    int size  = rSizes[i];
    if (i == 0)
        {
        start -= (isRc ? padding3 : padding5);
        assert(start == 0);
        size  += (isRc ? padding3 : padding5);
        }
    if (i == rCount-1)
        {
        size  += (isRc ? padding5 : padding3);
        }
    memcpy(cSeq->dna+offset, rSeq->dna+start, size);
    offset += size;
    }
assert(offset == cSeq->size);
cSeq->dna[offset] = 0;
freeDnaSeq(&rSeq);

if (isRc)
    reverseComplement(cSeq->dna, cSeq->size);

safef(recName, sizeof(recName),
      "%s_%s range=%s:%d-%d 5'pad=%d 3'pad=%d "
      "strand=%c repeatMasking=%s",
      db, name, chrom, seqStart+1, seqEnd,
      padding5, padding3,
      (isRc ? '-' : '+'),
      (maskRep ? repMasking : "none"));
faWriteNext(stdout, recName, cSeq->dna, cSeq->size);
freeDnaSeq(&cSeq);
}