Ejemplo n.º 1
0
void faLowerToN(char *inName, char *outName)
/* faLowerToN - Convert lower case bases to N.. */
{
struct lineFile *lf = lineFileOpen(inName, TRUE);
FILE *f = mustOpen(outName, "w");
char *line;
while (lineFileNext(lf, &line, NULL))
    {
    if (line[0] != '>')
       lowerToN(line);
    fprintf(f, "%s\n", line);
    }
carefulClose(&f);
}
Ejemplo n.º 2
0
void seqFromPsl(char *inPsl, char *inTwoBit, char *outFa)
/* seqFromPsl - Extract masked sequence from database corresponding to psl file. */
{
struct twoBitFile *tbf = twoBitOpen(inTwoBit);
struct lineFile *lf = pslFileOpen(inPsl);
FILE *f = mustOpen(outFa, "w");
struct psl *psl;

while ((psl = pslNext(lf)) != NULL)
    {
    char faHead[512];
    struct dnaSeq *seq = twoBitReadSeqFrag(tbf, psl->tName,
    	psl->tStart, psl->tEnd);
    if (psl->strand[0] == '-')
        reverseComplement(seq->dna, seq->size);
    safef(faHead, sizeof(faHead), "%s (%s:%d-%d)", 
    	psl->qName, psl->tName, psl->tStart+1, psl->tEnd);
    if (hardMask)
        lowerToN(seq->dna, seq->size);
    faWriteNext(f, faHead, seq->dna, seq->size);
    }
carefulClose(&f);
}
static void hgSeqConcatRegionsDb(char *db, char *chrom, int chromSize, char strand, char *name,
                                 int rCount, unsigned *rStarts, unsigned *rSizes,
                                 boolean *exonFlags, boolean *cdsFlags)
/* Concatenate and print out dna for a series of regions. */
{
// Note: this code use to generate different sequence ids if the global
// database in hdb was different than the db parameter.  This functionality
// has been removed since the global database was removed and it didn't
// appear to be used.

    struct dnaSeq *rSeq = NULL;
    struct dnaSeq *cSeq = NULL;
    char recName[256];
    int seqStart, seqEnd;
    int offset, cSize;
    int i;
    boolean isRc     = (strand == '-') || cgiBoolean("hgSeq.revComp");
    boolean maskRep  = cgiBoolean("hgSeq.maskRepeats");
    int padding5     = cgiOptionalInt("hgSeq.padding5", 0);
    int padding3     = cgiOptionalInt("hgSeq.padding3", 0);
    char *casing     = cgiString("hgSeq.casing");
    char *repMasking = cgiString("hgSeq.repMasking");
    char *granularity  = cgiOptionalString("hgSeq.granularity");
    boolean concatRegions = granularity && sameString("gene", granularity);

    if (rCount < 1)
        return;

    /* Don't support padding if granularity is gene (i.e. concat'ing all). */
    if (concatRegions)
    {
        padding5 = padding3 = 0;
    }

    i = rCount - 1;
    seqStart = rStarts[0]             - (isRc ? padding3 : padding5);
    seqEnd   = rStarts[i] + rSizes[i] + (isRc ? padding5 : padding3);
    /* Padding might push us off the edge of the chrom; if so, truncate: */
    if (seqStart < 0)
    {
        if (isRc)
            padding3 += seqStart;
        else
            padding5 += seqStart;
        seqStart = 0;
    }

    /* if we know the chromSize, don't pad out beyond it */
    if ((chromSize > 0) && (seqEnd > chromSize))
    {
        if (isRc)
            padding5 += (chromSize - seqEnd);
        else
            padding3 += (chromSize - seqEnd);
        seqEnd = chromSize;
    }
    if (seqEnd <= seqStart)
    {
        printf("# Null range for %s_%s (range=%s:%d-%d 5'pad=%d 3'pad=%d) (may indicate a query-side insert)\n",
               db,
               name,
               chrom, seqStart+1, seqEnd,
               padding5, padding3);
        return;
    }
    if (maskRep)
    {
        rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaMixed);
        if (sameString(repMasking, "N"))
            lowerToN(rSeq->dna, strlen(rSeq->dna));
        if (!sameString(casing, "upper"))
            tolowers(rSeq->dna);
    }
    else if (sameString(casing, "upper"))
        rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaUpper);
    else
        rSeq = hDnaFromSeq(db, chrom, seqStart, seqEnd, dnaLower);

    /* Handle casing and compute size of concatenated sequence */
    cSize = 0;
    for (i=0;  i < rCount;  i++)
    {
        if ((sameString(casing, "exon") && exonFlags[i]) ||
                (sameString(casing, "cds") && cdsFlags[i]))
        {
            int rStart = rStarts[i] - seqStart;
            toUpperN(rSeq->dna+rStart, rSizes[i]);
        }
        cSize += rSizes[i];
    }
    cSize += (padding5 + padding3);
    AllocVar(cSeq);
    cSeq->dna = needLargeMem(cSize+1);
    cSeq->size = cSize;

    offset = 0;
    for (i=0;  i < rCount;  i++)
    {
        int start = rStarts[i] - seqStart;
        int size  = rSizes[i];
        if (i == 0)
        {
            start -= (isRc ? padding3 : padding5);
            assert(start == 0);
            size  += (isRc ? padding3 : padding5);
        }
        if (i == rCount-1)
        {
            size  += (isRc ? padding5 : padding3);
        }
        memcpy(cSeq->dna+offset, rSeq->dna+start, size);
        offset += size;
    }
    assert(offset == cSeq->size);
    cSeq->dna[offset] = 0;
    freeDnaSeq(&rSeq);

    if (isRc)
        reverseComplement(cSeq->dna, cSeq->size);

    safef(recName, sizeof(recName),
          "%s_%s range=%s:%d-%d 5'pad=%d 3'pad=%d "
          "strand=%c repeatMasking=%s",
          db,
          name,
          chrom, seqStart+1, seqEnd,
          padding5, padding3,
          (isRc ? '-' : '+'),
          (maskRep ? repMasking : "none"));
    faWriteNext(stdout, recName, cSeq->dna, cSeq->size);
    freeDnaSeq(&cSeq);
}