void readFa(char *fileName, struct rnaCover **retList, struct hash **retHash)
/* Read in an FA file and store name and size of every record in hash/list.
 * Records smaller than minSize (defined outside this function) are ignored.
 * When a name is already in the hash, a warning is issued and that record is
 * skipped entirely, so only the first record with a given name is kept.
 * The list, in file order, is returned in *retList and the hash keyed by
 * record name in *retHash. */
{
struct rnaCover *list = NULL, *rc;
struct hash *hash = newHash(18);
struct lineFile *lf = lineFileOpen(fileName, TRUE);
DNA *dna;
int size;
char *name;

while (faSpeedReadNext(lf, &dna, &size, &name))
    {
    if (size < minSize)
        continue;
    /* Test for a duplicate before allocating anything, so that a skipped
     * record never ends up on the list as an element with no name and a
     * zero size. */
    if (hashLookup(hash, name))
        {
        warn("Duplicate %s line %d of %s, skipping", name, lf->lineIx, lf->fileName);
        continue;
        }
    AllocVar(rc);
    /* hashAddSaveName is handed &rc->name and fills it in. */
    hashAddSaveName(hash, name, rc, &rc->name);
    rc->qSize = size;
    slAddHead(&list, rc);
    }
lineFileClose(&lf);
/* Elements were pushed on the head, so reverse to restore file order. */
slReverse(&list);
*retList = list;
*retHash = hash;
}
void splitNcbiFa(char *ncbiIn, char *outDir)
/* splitNcbiFa - Split up NCBI format fa file into UCSC formatted ones.
 * The name of every record must consist of exactly five '|' separated
 * fields, otherwise the program aborts.  The fourth field, after chopSuffix
 * has been applied to a copy of it, names the output file
 * <outDir>/<clone>.fa; the record written to that file is named
 * <fourth field>_1. */
{
struct lineFile *lf = lineFileOpen(ncbiIn, TRUE);
static struct dnaSeq seq;
ZeroVar(&seq);

makeDir(outDir);
while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    char fileName[512];
    char *row[5];
    int wordCount;
    char ourName[129];
    char cloneName[128];

    wordCount = chopByChar(seq.name, '|', row, ArraySize(row));
    if (wordCount != 5)
        errAbort("Expecting 5 | separated fields line %d of %s", lf->lineIx, lf->fileName);
    /* The field text and outDir come from outside the program, so every
     * copy into a fixed-size buffer is given the destination size via
     * safef (already used elsewhere in this file) rather than going through
     * unchecked strcpy/sprintf. */
    safef(cloneName, sizeof(cloneName), "%s", row[3]);
    chopSuffix(cloneName);
    safef(fileName, sizeof(fileName), "%s/%s.fa", outDir, cloneName);
    safef(ourName, sizeof(ourName), "%s_1", row[3]);
    faWrite(fileName, ourName, seq.dna, seq.size);
    }
lineFileClose(&lf);
}
Пример #3
0
void polyInfo(char *pslFile, char *genoFile, char *estFile, char *outputFile)
/* polyInfo - Collect info on polyAdenylation signals etc.
 * Reads the EST records of estFile one at a time.  For each record whose
 * name is found in the hash built from pslFile, every psl on that entry's
 * list is combined with the matching genomic sequence (looked up by
 * psl->tName in the hash built from genoFile) into an estOrientInfo, which
 * is written tab-separated to outputFile.  Aborts if a psl's tSize differs
 * from the size of the genomic sequence it refers to. */
{
struct hash *pslHash = NULL;
struct hash *genoHash = loadGeno(genoFile);
static struct dnaSeq est;
struct lineFile *lf = NULL;
FILE *f = NULL;

pslHash = pslIntoHash(pslFile);
lf = lineFileOpen(estFile, TRUE);
f = mustOpen(outputFile, "w");

while (faSpeedReadNext(lf, &est.dna, &est.size, &est.name))
    {
    struct pslList *pl;
    struct psl *psl;
    struct estOrientInfo ei;
    if ((pl = hashFindVal(pslHash, est.name)) != NULL)
        {
        for (psl = pl->list; psl != NULL; psl = psl->next)
            {
            struct dnaSeq *geno = hashMustFindVal(genoHash, psl->tName);
            /* A size mismatch means the alignment coordinates do not apply
             * to the sequence that was loaded. */
            if (psl->tSize != geno->size)
                errAbort("psl generated on a different version of the genome");
            ZeroVar(&ei);
            fillInEstInfo(&ei, &est, geno, psl);
            estOrientInfoTabOut(&ei, f);
            }
        }
    }
/* Release the input and close the output explicitly instead of leaving
 * both open when the function returns. */
lineFileClose(&lf);
carefulClose(&f);
}
Пример #4
0
struct frag *readFragList(char *fileName)
/* Read list of frags from file.
 * One frag is created per fa record.  The record name must contain a '_'
 * whose last occurrence is directly followed by a digit; the number after
 * it is the fragment index.  Start is index*1000, end is start+1000, and
 * the chromosome is always set to "chr14".  Aborts on a name that does not
 * match.  Returns the list in file order. */
{
struct frag *list = NULL, *frag;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct dnaSeq seq;
char *s;
int fragIx;
ZeroVar(&seq);

printf("Reading %s\n", fileName);
while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
    {
    AllocVar(frag);
    frag->name = cloneString(seq.name);
    s = strrchr(seq.name, '_');
    /* isdigit is only defined for unsigned char values and EOF, so convert
     * before testing in case plain char is signed. */
    if (s == NULL || !isdigit((unsigned char)s[1]))
        errAbort("Expecting _ and number in %s", seq.name);
    fragIx = atoi(s+1);
    frag->chrom = "chr14";      /* hard-coded: not taken from the input */
    frag->start = fragIx*1000;
    frag->end = frag->start + 1000;
    slAddHead(&list, frag);
    }
lineFileClose(&lf);
printf("Read %d fragments from %s\n", slCount(list), fileName);
/* Elements were pushed on the head, so reverse to restore file order. */
slReverse(&list);
return list;
}
void countSeq(char *fileName, int *retSeqCount, int *retBaseCount)
/* Walk through every record of an fa file, returning the number of records
 * in *retSeqCount and the sum of their sizes in *retBaseCount. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
int records = 0;
int bases = 0;
int recSize;
char *recName;
DNA *recDna;

for (;;)
    {
    if (!faSpeedReadNext(lf, &recDna, &recSize, &recName))
        break;
    ++records;
    bases += recSize;
    }
lineFileClose(&lf);

*retSeqCount = records;
*retBaseCount = bases;
}
Пример #6
0
void correctEst(char *oldFa, char *pslFile, char *nibDir, char *outFa)
/* correctEst - Correct ESTs by passing them through genome.
 * Reads every record of oldFa.  A record whose name is found in the hash
 * built from pslFile is handed to correctOne together with its psl, nibDir
 * and a hash shared across all calls; any other record is written to outFa
 * unchanged. */
{
struct hash *pslHash = hashPsls(pslFile);
struct lineFile *lf = lineFileOpen(oldFa, FALSE);
FILE *f = mustOpen(outFa, "w");
static struct dnaSeq est;
struct psl *psl;
struct hash *nibHash = newHash(8);

while (faSpeedReadNext(lf, &est.dna, &est.size, &est.name))
    {
    if ((psl = hashFindVal(pslHash, est.name)) != NULL)
        {
        correctOne(&est, psl, nibDir, nibHash, f);
        }
    else
        {
        faWriteNext(f, est.name, est.dna, est.size);
        }
    }
/* Release the input and close the output explicitly instead of leaving
 * both open when the function returns. */
lineFileClose(&lf);
carefulClose(&f);
}
void gsBig(char *faName, char *gtfName, 
	   char *suboptName, 
	   char *transName,
	   char *exeName, 
	   char *parName,
	   char *tmpDirName)
/* gsBig - Run Genscan on big input and produce GTF files.
 *   faName     - fa file holding the sequences to process.
 *   gtfName    - output file, always written.
 *   suboptName - optional output file; when non-NULL " -subopt" is also
 *                added to the command line.
 *   transName  - optional output file.
 *   exeName, parName, tmpDirName - when non-NULL, replace the exePath,
 *                parPath and tmpDir settings (defined outside this function).
 * With the "prerun" option an existing output file is parsed and written
 * out instead of running anything.  Otherwise each sequence is cut into
 * windows of at most winSize bases starting every stepSize bases; each
 * window that is not all N is written to a temp fa file, the command is run
 * on it, and the parsed results of all windows are merged and written.
 * Temp files are removed unless the "noRemove" option is present. */
{
struct dnaSeq seq;
struct lineFile *lf = lineFileOpen(faName, TRUE);
FILE *gtfFile = mustOpen(gtfName, "w");
FILE *subFile = NULL;
FILE *transFile = NULL;
ZeroVar(&seq);

if (suboptName != NULL)
    subFile = mustOpen(suboptName, "w");
if (transName != NULL)
    transFile = mustOpen(transName, "w");
if (exeName != NULL)
    exePath = cloneString(exeName);
if (parName != NULL)
    parPath = cloneString(parName);
if (tmpDirName != NULL)
    tmpDir = cloneString(tmpDirName);

if (optionExists("prerun"))
    {
    char *preFileName = optionVal("prerun", NULL);
    char seqName[128];
    struct segment *seg = parseSegment(preFileName, 0, 100000000, seqName);
    writeSeg(seqName, seg, gtfFile, subFile, transFile);
    }
else
    {
    struct dyString *dy = newDyString(1024);
    char tempFa[512], tempGs[512];
    char dir1[256], root1[128], ext1[64];
    int myPid = (int)getpid();
    boolean keepTemp = optionExists("noRemove");

    splitPath(faName, dir1, root1, ext1);
    while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
        {
        int offset, sizeOne;
        struct segment *segList = NULL, *seg;
        char *seqName = cloneString(seq.name);
        int chunkNum = 0;

        for (offset = 0; offset < seq.size; offset += stepSize)
            {
            boolean allN = TRUE;
            int i;
            /* Temp names carry pid, sequence name and chunk number, so each
             * chunk gets its own pair of files. */
            safef(tempFa, sizeof(tempFa), "%s/temp_gsBig_%d_%s_%d.fa",
                  tmpDir, myPid, seqName, chunkNum);
            safef(tempGs, sizeof(tempGs), "%s/temp_gsBig_%d_%s_%d.genscan",
                  tmpDir, myPid, seqName, chunkNum);
            sizeOne = seq.size - offset;
            if (sizeOne > winSize) sizeOne = winSize;
            /* Genscan hangs forever if a chunk is all-N's... if so, 
             * then skip this chunk. */
            for (i=offset;  i < (offset+sizeOne);  i++)
                {
                if (seq.dna[i] != 'N' && seq.dna[i] != 'n')
                    {
                    allN = FALSE;
                    break;
                    }
                }
            if (allN)
                {
                printf("\ngsBig: skipping %s[%d:%d] -- it's all N's.\n\n",
                       seqName, offset, (offset+sizeOne-1));
                }
            else
                {
                faWrite(tempFa, "split", seq.dna + offset, sizeOne); 
                /* NOTE(review): this shell command embeds exePath, parPath
                 * and a temp name containing the sequence name read from
                 * the input file; none of it is quoted or escaped. */
                dyStringClear(dy);
                dyStringPrintf(dy, "%s %s %s", exePath, parPath, tempFa);
                if (suboptName != NULL)
                    dyStringPrintf(dy, " -subopt");
                dyStringPrintf(dy, " > %s", tempGs);
                verbose(3, "%s\n", dy->string);
                mustSystem(dy->string);
                seg = parseSegment(tempGs, offset, offset+sizeOne, NULL);
                slAddHead(&segList, seg);
                /* Remove this chunk's files here, while their names are
                 * still in the buffers.  Removing once after the loop only
                 * deleted the last chunk's pair and, for an input with no
                 * records, passed uninitialised buffers to remove().
                 * Assumes parseSegment is done with tempGs once it has
                 * returned -- TODO confirm. */
                if (!keepTemp)
                    {
                    remove(tempFa);
                    remove(tempGs);
                    }
                }
            chunkNum++;
            }
        slReverse(&segList);
        seg = mergeSegs(segList);
        writeSeg(seqName, seg, gtfFile, subFile, transFile);
        freez(&seqName);
        }
    freeDyString(&dy);
    }
/* Release the input and close every output that was opened.  subFile and
 * transFile are still NULL when the corresponding name was not given. */
lineFileClose(&lf);
carefulClose(&gtfFile);
carefulClose(&subFile);
carefulClose(&transFile);
}
void faCount(char *faFiles[], int faCount)
/* faCount - count bases.
 * For each record of each of the faCount files in faFiles, counts the
 * length, the number of each base (A, C, G, T, N), the CpG count and all
 * dinucleotide counts.  Unless the summary setting is on, one tab-separated
 * line is printed per record.  A "total" line is always printed, followed
 * by a "prcnt" line of fractions of the total length when summary is on.
 * The sixteen A/C/G/T dinucleotide columns are printed only when dinuc is
 * on.  When strands is on, the complementary base of every position is
 * counted as well, which doubles the reported length.  dinuc, strands and
 * summary are defined outside this function. */
{
int f, i, j, k;
struct dnaSeq seq;
unsigned long long totalLength = 0;
unsigned long long totalBaseCount[5];
unsigned long long totalDinucleotideCount[5][5];
unsigned long long totalCpgCount = 0;
struct lineFile *lf;
ZeroVar(&seq);

for (i = 0; i < ArraySize(totalBaseCount); i++)
    totalBaseCount[i] = 0;

for (i = 0; i < ArraySize(totalDinucleotideCount); i++)
    for (j = 0; j < ArraySize(totalDinucleotideCount[i]); j++)
        totalDinucleotideCount[i][j] = 0;

printf("#seq\tlen\tA\tC\tG\tT\tN\tcpg");
if (dinuc)
    printf("\tAA\tAC\tAG\tAT\tCA\tCC\tCG\tCT\tGA\tGC\tGG\tGT\tTA\tTC\tTG\tTT");
printf("\n");

dnaUtilOpen();
for (f = 0; f<faCount; ++f)
    {
    lf = lineFileOpen(faFiles[f], FALSE);
    while (faSpeedReadNext(lf, &seq.dna, &seq.size, &seq.name))
        {
        /* -1 means "no previous base yet" at the start of a record. */
        int prevBase = -1;
        int prevRcBase = -1;
        unsigned long long length = 0;
        unsigned long long baseCount[5];
        unsigned long long dinucleotideCount[5][5];
        unsigned long long cpgCount = 0;
        for (i = 0; i < ArraySize(baseCount); i++)
            baseCount[i] = 0;
        for (i = 0; i < ArraySize(dinucleotideCount); i++)
            for (j = 0; j < ArraySize(dinucleotideCount[i]); j++)
                dinucleotideCount[i][j] = 0;
        for (j=0; j<seq.size; ++j)
            {
            /* Index through unsigned char so that a byte >= 0x80 cannot
             * turn into a negative index where plain char is signed.
             * (ntVal5 is presumably a 256-entry table -- TODO confirm.) */
            int baseVal = ntVal5[(unsigned char)(seq.dna[j])];
            int rcBaseVal;
            assert(baseVal != -1);
            assert(baseVal <= 4);
            length++;
            /* Complement of this base; anything other than A/C/G/T maps
             * to N. */
            switch(baseVal)
                {
                case A_BASE_VAL: rcBaseVal = T_BASE_VAL; break;
                case C_BASE_VAL: rcBaseVal = G_BASE_VAL; break;
                case G_BASE_VAL: rcBaseVal = C_BASE_VAL; break;
                case T_BASE_VAL: rcBaseVal = A_BASE_VAL; break;
                default: rcBaseVal = N_BASE_VAL; break;
                }
            baseCount[baseVal]++;
            if ((prevBase == C_BASE_VAL) && (baseVal == G_BASE_VAL))
                cpgCount++;
            if (prevBase != -1)
                dinucleotideCount[prevBase][baseVal]++;
            if (strands)
                {
                length++;
                baseCount[rcBaseVal]++;
                /* On the other strand the pair reads in the opposite
                 * direction, hence current-then-previous. */
                if ((prevRcBase == G_BASE_VAL) && (rcBaseVal == C_BASE_VAL))
                    cpgCount++;
                if (prevRcBase != -1)
                    dinucleotideCount[rcBaseVal][prevRcBase]++;
                }
            prevBase = baseVal;
            prevRcBase = rcBaseVal;
            }
        if (!summary)
            {
            printf("%s\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
            seq.name, length,
            baseCount[A_BASE_VAL], baseCount[C_BASE_VAL],
            baseCount[G_BASE_VAL], baseCount[T_BASE_VAL],
            baseCount[N_BASE_VAL], cpgCount);
            if (dinuc)
                printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
                    dinucleotideCount[A_BASE_VAL][A_BASE_VAL], dinucleotideCount[A_BASE_VAL][C_BASE_VAL],
                    dinucleotideCount[A_BASE_VAL][G_BASE_VAL], dinucleotideCount[A_BASE_VAL][T_BASE_VAL],
                    dinucleotideCount[C_BASE_VAL][A_BASE_VAL], dinucleotideCount[C_BASE_VAL][C_BASE_VAL],
                    dinucleotideCount[C_BASE_VAL][G_BASE_VAL], dinucleotideCount[C_BASE_VAL][T_BASE_VAL],
                    dinucleotideCount[G_BASE_VAL][A_BASE_VAL], dinucleotideCount[G_BASE_VAL][C_BASE_VAL],
                    dinucleotideCount[G_BASE_VAL][G_BASE_VAL], dinucleotideCount[G_BASE_VAL][T_BASE_VAL],
                    dinucleotideCount[T_BASE_VAL][A_BASE_VAL], dinucleotideCount[T_BASE_VAL][C_BASE_VAL],
                    dinucleotideCount[T_BASE_VAL][G_BASE_VAL], dinucleotideCount[T_BASE_VAL][T_BASE_VAL]);
            printf("\n");
            }
        /* Fold this record into the running totals. */
        totalLength += length;
        totalCpgCount += cpgCount;
        for (i = 0; i < ArraySize(baseCount); i++)
            totalBaseCount[i] += baseCount[i];
        for (i = 0; i < ArraySize(dinucleotideCount); i++)
            for (k = 0; k < ArraySize(dinucleotideCount[i]); k++)
                totalDinucleotideCount[i][k] += dinucleotideCount[i][k];
        }
    lineFileClose(&lf);
    }


printf("total\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
       totalLength,
       totalBaseCount[A_BASE_VAL], totalBaseCount[C_BASE_VAL],
       totalBaseCount[G_BASE_VAL], totalBaseCount[T_BASE_VAL],
       totalBaseCount[N_BASE_VAL], totalCpgCount);
if (dinuc)
    printf("\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu\t%llu",
        totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL],
        totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL],
        totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL],
        totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL],
        totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL],
        totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL],
        totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL],
        totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL], totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]);
printf("\n");

if (summary)
    {
    /* Every value on this line is a fraction of totalLength.  With no
     * input at all totalLength is 0 and these become 0/0. */
    printf("prcnt\t%-5.1f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f",
       (float)totalLength/totalLength,
       ((float)totalBaseCount[A_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[C_BASE_VAL])/(float)totalLength,
       ((float)totalBaseCount[G_BASE_VAL])/(float)totalLength, ((float)totalBaseCount[T_BASE_VAL])/(float)totalLength,
       ((float)totalBaseCount[N_BASE_VAL])/(float)totalLength, (float)totalCpgCount/(float)totalLength);
    if (dinuc)
        printf("\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f\t%-5.4f",
            (float)totalDinucleotideCount[A_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][C_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[A_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[A_BASE_VAL][T_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[C_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][C_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[C_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[C_BASE_VAL][T_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[G_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][C_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[G_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[G_BASE_VAL][T_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[T_BASE_VAL][A_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][C_BASE_VAL]/(float)totalLength,
            (float)totalDinucleotideCount[T_BASE_VAL][G_BASE_VAL]/(float)totalLength, (float)totalDinucleotideCount[T_BASE_VAL][T_BASE_VAL]/(float)totalLength);
    printf("\n");
    }
}