Esempio n. 1
0
void getChromSizes(char *database, struct hash **retHash, 
	struct chromSizes **retList)
/* Build one chromSizes record per entry returned by getAllChromInfo(database).
 * Each record gets totalSize copied from the chromInfo size, and seqSize
 * set to the summed span (chromEnd - chromStart) of every row the "gold"
 * query returns for that chromosome, i.e. the size without gaps.
 * Records are returned both keyed by chromosome name in *retHash and as
 * a list in *retList, in the same order as the chromInfo list. */
{
struct sqlConnection *conn = hAllocConn(database);
/* NOTE(review): chromList is not released anywhere in this function;
 * confirm whether the caller or process exit is expected to reclaim it. */
struct chromInfo *chromList = getAllChromInfo(database);
struct hash *sizeHash = newHash(8);
struct chromSizes *sizeList = NULL;
struct chromInfo *chrom = chromList;

while (chrom != NULL)
    {
    struct chromSizes *sizes;
    struct sqlResult *sr;
    char **row;
    int rowOffset;

    /* The hash keeps its own copy of the name and lends it to sizes->name. */
    AllocVar(sizes);
    hashAddSaveName(sizeHash, chrom->chrom, sizes, &sizes->name);
    slAddHead(&sizeList, sizes);
    sizes->totalSize = chrom->size;

    /* Accumulate the length of every gold fragment on this chromosome. */
    sr = hChromQuery(conn, "gold", chrom->chrom, NULL, &rowOffset);
    for (;;)
        {
        struct agpFrag frag;
        row = sqlNextRow(sr);
        if (row == NULL)
            break;
        agpFragStaticLoad(row + rowOffset, &frag);
        sizes->seqSize += frag.chromEnd - frag.chromStart;
        }
    sqlFreeResult(&sr);

    chrom = chrom->next;
    }

hFreeConn(&conn);

/* Records were pushed on the head, so flip back to chromInfo order. */
slReverse(&sizeList);
*retList = sizeList;
*retHash = sizeHash;
}
void readPatch(char *fileName, struct hash *cloneHash, 
	struct ntContig **retNtList, struct hash **retNtHash)
/* Read nt.agp file into clone/hash.
 *   fileName   - nine-column AGP file, one fragment per line, with lines
 *                for the same NT contig adjacent to each other.
 *   cloneHash  - clones keyed by name; every fragment's clone must already
 *                be present (hashMustFindVal).  The clone
 *                records are updated in place with NT/gold coordinates.
 *   retNtList  - receives the list of NT contigs in file order.
 *   retNtHash  - receives a hash of the same contigs keyed by name.
 * Inconsistencies are reported with warn() and flag the contig's
 * 'problem' field; malformed strand/type fields, a repeated contig, or
 * an over-long fragment name abort via errAbort(). */
{
struct ntContig *ntList = NULL, *nt = NULL;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[9];
struct agpFrag frag;
struct clone *clone, *ntClone, *lastClone = NULL;
struct cloneRef *ref;
struct hash *ntHash = newHash(0);
char cloneName[128];
char fragName[128];
char c;
int ntOrder = 0;
int nameLen;

while (lineFileRow(lf, row))
    {
    agpFragStaticLoad(row, &frag);
    // file is 1-based but agpFragLoad() now assumes 0-based:
    frag.chromStart -= 1;
    frag.fragStart  -= 1;

    /* A change of contig name starts a new NT contig. */
    if (nt == NULL || !sameString(frag.chrom, nt->name))
        {
        AllocVar(nt);
        slAddHead(&ntList, nt);
        if (hashLookup(ntHash, frag.chrom) != NULL)
            errAbort("NT contig %s repeated line %d of %s", row[0], lf->lineIx, lf->fileName);
        hashAddSaveName(ntHash, frag.chrom, nt, &nt->name);
        lastClone = NULL;
        ntOrder = 0;
        }

    /* chopSuffix edits its argument, so work on a private copy.  The
     * fragment name comes straight from the file: bound the copy and
     * refuse names that do not fit rather than overrunning the stack. */
    nameLen = snprintf(cloneName, sizeof(cloneName), "%s", frag.frag);
    if (nameLen < 0 || nameLen >= (int)sizeof(cloneName))
        errAbort("Fragment name %s too long (limit %d chars) line %d of %s",
                frag.frag, (int)sizeof(cloneName) - 1, lf->lineIx, lf->fileName);
    chopSuffix(cloneName);
    clone = hashMustFindVal(cloneHash, cloneName);
    clone->ntStart = frag.chromStart;
    clone->ntEnd = frag.chromEnd;
    if (clone->nt != NULL)
        {
        warn("Clone %s trying to be in two NT contigs (%s and %s) line %d of %s",
                clone->name, clone->nt->name, nt->name, lf->lineIx, lf->fileName);
        nt->problem = TRUE;
        }
    clone->nt = nt;
    c = frag.strand[0];
    if (c == '-')
        clone->ntOrientation = -1;
    else if (c == '+')
        clone->ntOrientation = +1;
    else
        errAbort("Expecting +1 or -1 field 5, line %d, file %s", lf->lineIx, lf->fileName);
    c = frag.type[0];
    if (c == 'F' || c == 'D' || c == 'P')
        clone->seqType =  c;
    else
        errAbort("Expecting F, D, or P  field 6, line %d, file %s", lf->lineIx, lf->fileName);

    /* Same bounded formatting for the derived "<frag>_1" name. */
    nameLen = snprintf(fragName, sizeof(fragName), "%s_1", frag.frag);
    if (nameLen < 0 || nameLen >= (int)sizeof(fragName))
        errAbort("Fragment name %s too long (limit %d chars) line %d of %s",
                frag.frag, (int)sizeof(fragName) - 3, lf->lineIx, lf->fileName);
    clone->fragName = cloneString(fragName);
    clone->goldStart = frag.fragStart;
    clone->goldEnd = frag.fragEnd;
    clone->ntOrder = ntOrder++;

    /* Add ref to NT. */
    AllocVar(ref);
    ref->ref = clone;
    slAddTail(&nt->cloneList, ref);

    /* Do a few tests. */
    if (clone->goldStart >= clone->goldEnd)
        {
        warn("Clone %s end before start (%d before %d) line %d of %s", 
                clone->name, clone->goldStart, clone->goldEnd, lf->lineIx, lf->fileName);
        nt->problem = TRUE;
        }
    if (clone->ntStart >= clone->ntEnd)
        {
        warn("Clone %s NT end before NT start line %d of %s", 
                clone->name, lf->lineIx, lf->fileName);
        nt->problem = TRUE;
        }
    if (clone->goldEnd > clone->size)
        {
        /* Only flagged when the clone's start and end fragment are the same. */
        if (sameString(clone->startFrag, clone->endFrag))
            {
            warn("Clone %s end position %d, clone size %d, line %d of %s", 
                clone->name, clone->goldEnd, clone->size, lf->lineIx, lf->fileName);
            nt->problem = TRUE;
            }
        }
    if (clone->ntEnd - clone->ntStart != clone->goldEnd - clone->goldStart)
        {
        warn("Size not the same in NT contig as in clone %s (%d vs %d) line %d of %s",
                clone->name,
                clone->ntEnd - clone->ntStart, clone->goldEnd-clone->goldStart,
                lf->lineIx, lf->fileName);
        nt->problem = TRUE;
        }
    nt->sumSize += clone->goldEnd - clone->goldStart;

    /* The NT contig itself may be present in cloneHash under its own name;
     * if so its size is authoritative for the contig. */
    ntClone = hashFindVal(cloneHash, nt->name);
    if (ntClone != NULL && clone->ntEnd > ntClone->size)
        {
        warn("Clone %s NT end position %d, NT size %d, line %d of %s", 
            clone->name, clone->ntEnd, ntClone->size, lf->lineIx, lf->fileName);
        nt->problem = TRUE;
        }
    if (ntClone != NULL)
        nt->size = ntClone->size;
    else
        nt->size = clone->size;		/* This happens for single-clone NT contigs only. */

    /* Consecutive clones within a contig should abut exactly. */
    if (lastClone != NULL)
        {
        if (lastClone->ntEnd != clone->ntStart)
            {
            warn("last clone (%s)'s end doesn't match with current clone (%s)'s start line %d of %s",
                lastClone->name, clone->name, lf->lineIx, lf->fileName);
            }
        }
    lastClone = clone;
    }

lineFileClose(&lf);

/* Contigs were pushed on the head; restore file order. */
slReverse(&ntList);

/* Cross-check each contig's declared size against its fragments. */
for (nt = ntList; nt != NULL; nt = nt->next)
    {
    if (nt->sumSize != nt->size)
        {
        warn("Sum of fragments of %s is %d, but size is supposed to be %d",
                nt->name, nt->sumSize, nt->size);
        nt->problem = TRUE;
        }
    }
*retNtList = ntList;
*retNtHash = ntHash;
}
void splitAgp(char *agpName, char *goldFileName, char *gapFileName)
/* Split up agp file into gold and gap files.
 *   agpName      - input AGP file.
 *   goldFileName - output tab file receiving component (fragment) lines.
 *   gapFileName  - output tab file receiving gap lines (type N or U).
 * Each output row is prefixed with the bin computed by hFindBin().
 * Also tracks the longest chromosome and fragment names seen in the
 * file-level maxChromNameSize / maxFragNameSize, aborting if either
 * exceeds 254 characters.  Lines with too few fields abort. */
{
    /* Minimum columns each record type needs before it can be loaded:
     * 9 for a component line (matching the nine-column agpFrag layout),
     * 8 for a gap line (per the agpGap record definition). */
    enum { minGapWords = 8, minFragWords = 9 };
    struct lineFile *lf;
    char *words[16];
    int wordCount;
    FILE *goldTab, *gapTab;

    /* Scan through .agp file splitting it into gold
     * and gap components. */
    goldTab = mustOpen(goldFileName, "w");
    gapTab = mustOpen(gapFileName, "w");
    lf = lineFileOpen(agpName, TRUE);
    while ((wordCount = lineFileChop(lf, words)) > 0)
    {
        int start, end;
        /* Need at least the type column (words[4]) to classify the line. */
        if (wordCount < 5)
            errAbort("Short line %d of %s", lf->lineIx, lf->fileName);
        int len = strlen(words[0]);
        if (len > maxChromNameSize)
        {
            maxChromNameSize = len;
            if (maxChromNameSize > 254)
                errAbort("ERROR: chrom name size is over 254(%d) characters: "
                         "'%s'", maxChromNameSize, words[0]);
        }

        /* File coordinates are 1-based; bins are computed on 0-based start. */
        start = sqlUnsigned(words[1])-1;
        end = sqlUnsigned(words[2]);
        if (words[4][0] == 'N' || words[4][0] == 'U')
        {
            struct agpGap gap;
            /* Without this check a truncated gap line would make the
             * loader read words[] slots left over from an earlier line
             * (or never set at all). */
            if (wordCount < minGapWords)
                errAbort("Expecting at least %d words line %d of %s, got %d",
                         (int)minGapWords, lf->lineIx, lf->fileName, wordCount);
            agpGapStaticLoad(words, &gap);
            gap.chromStart -= 1;
            fprintf(gapTab, "%u\t", hFindBin(start, end));
            agpGapTabOut(&gap, gapTab);
            verbose(3,"#GAP\t%s:%d-%d\n", gap.chrom, gap.chromStart, gap.chromEnd);
        }
        else
        {
            struct agpFrag gold;
            /* words[5] and later are read below; make sure this line
             * actually supplied them. */
            if (wordCount < minFragWords)
                errAbort("Expecting at least %d words line %d of %s, got %d",
                         (int)minFragWords, lf->lineIx, lf->fileName, wordCount);
            agpFragStaticLoad(words, &gold);
            agpFragValidate(&gold);
            len = strlen(words[5]);
            if (len > maxFragNameSize)
            {
                maxFragNameSize = len;
                if (maxFragNameSize > 254)
                    errAbort("ERROR: fragment name size is over 254(%d) "
                             "characters: '%s'", maxFragNameSize, words[5]);
            }
            // file is 1-based. agpFragLoad() now assumes 0-based.
            // and agpFragTabOut() will assume 1-based, but we will load
            // the generated file straight into the database, so
            // subtract 2:
            gold.chromStart -= 2;
            gold.fragStart  -= 2;
            fprintf(goldTab, "%u\t", hFindBin(start, end));
            agpFragTabOut(&gold, goldTab);
        }
    }
    lineFileClose(&lf);
    carefulClose(&goldTab);
    carefulClose(&gapTab);

}