Beispiel #1
0
int putInNameTable(struct hash *hash, FILE *f, char *name)
/* Return the ID stored for name in the name table.
 *   hash - name table; values are integer IDs packed into the pointer slot
 *   f    - tab file that new entries would be written to (only referenced by
 *          the compiled-out branch below)
 *   name - name to look up; NULL yields ID 0
 * A name that is not already in the table currently triggers errAbort,
 * because the code that used to allocate a fresh ID is disabled. */
{
if (name == NULL)
    return 0;

struct hashEl *found = hashLookup(hash, name);
if (found != NULL)
    return ptToInt(found->val);

// It appears like this program is dead code that is no longer used.
// I don't want to make a potential bogus change to track the removal
// of hgNextId(), so a landmine is added.  Code can be change to determine
// max id from table if ever needed.  markd 2010-12-15
#if 1
errAbort("code hasn't been updated to work, please see markd");
return 0;
#else
int id = hgNextId();
fprintf(f, "%u\t%s\n", id, name);
hashAdd(hash, name, intToPt(id));
return id;
#endif
}
struct hash *optionalFieldsHash()
/* Build and return a hash keyed by every entry of expOptionalFields.
 * The value stored for each key is that entry's index in the array,
 * packed into the pointer slot with intToPt. */
{
struct hash *fieldHash = hashNew(8);
int ix = 0;
while (ix < ArraySize(expOptionalFields))
    {
    hashAdd(fieldHash, expOptionalFields[ix], intToPt(ix));
    ix += 1;
    }
return fieldHash;
}
void addCountsToHash(struct hash *countHash, struct bed *bedList)
/* For every bed in bedList, add one to the integer count stored in
 * countHash under the bed's name.
 *   countHash - hash of name -> count (int packed into the pointer slot)
 *   bedList   - list of beds whose names are tallied
 * Each bed name must already have an entry in countHash.  A missing name is
 * reported with errAbort instead of dereferencing a NULL lookup result. */
{
struct bed *bed;
for (bed = bedList; bed != NULL; bed = bed->next)
    {
    struct hashEl *el = hashLookup(countHash, bed->name);
    if (el == NULL)
        errAbort("%s is not in the count hash", bed->name);
    el->val = intToPt(ptToInt(el->val) + 1);
    }
}
void addFreqToHash(struct hash *freqHash, char *tag, char *id, int val)
/* Record one (id, val) pair under tag in freqHash.
 * This is done at each line in the frequencies file.
 * The value slot of tag's hash element is used as the head pointer of an
 * slPair list; the new pair is pushed onto the front of that list. */
{
/* Get the element for tag first, as its value slot is the list head. */
struct hashEl *tagEl = hashStore(freqHash, tag);
struct slPair **listHead = (struct slPair **)&tagEl->val;

struct slPair *pair;
AllocVar(pair);
pair->name = cloneString(id);   /* the pair keeps its own copy of id */
pair->val = intToPt(val);
slAddHead(listHead, pair);
}
Beispiel #5
0
static void handleNoMapping(struct spMapper *sm, char *id, char reason)
/* Remember that id could not be mapped, and why.
 * The per-reason counters in sm are only bumped while the stored value for
 * id is still NULL, so an id is not counted again once a reason has been
 * recorded.  The stored reason itself is overwritten on every call. */
{
struct hashEl *entry = hashStore(sm->noMapTbl, id);
int notYetRecorded = (entry->val == NULL);
if (notYetRecorded && reason == noUnirefMapping)
    sm->noUnirefMapCnt++;
else if (notYetRecorded && reason == noKGMapping)
    sm->noSpIdMapCnt++;
entry->val = intToPt(reason);
}
struct hash *hashRowOffsets(char *line)
/* Split a space-delimited line into words with nextWord and return a new
 * hash that maps each word to its 0-based position in the line. */
{
struct hash *offsets = hashNew(0);
int position = 0;
char *word;
for (word = nextWord(&line); word != NULL; word = nextWord(&line))
    {
    hashAdd(offsets, word, intToPt(position));
    ++position;
    }
return offsets;
}
Beispiel #7
0
void loadCoverQSizes(char* coverQSizeFile)
/* Fill the coverQSizes hash from a tab-separated file with two columns per
 * row: a name (the hash key) and a signed integer size (the hash value). */
{
struct lineFile *in = lineFileOpen(coverQSizeFile, TRUE);
char *cols[2];

coverQSizes = hashNew(0);
for (;;)
    {
    if (!lineFileNextRowTab(in, cols, ArraySize(cols)))
        break;
    int size = sqlSigned(cols[1]);
    hashAdd(coverQSizes, cols[0], intToPt(size));
    }
lineFileClose(&in);
}
static void addSize(char *name, unsigned size, struct hash *sizes)
/* Record size for name in the sizes hash.
 *   name  - sequence name (hash key)
 *   size  - sequence size, stored packed into the pointer slot
 *   sizes - hash of name -> size
 * A name may be given more than once only if the size is the same each
 * time; a conflicting size is a fatal error. */
{
struct hashEl *hel = hashLookup(sizes, name);
if (hel == NULL)
    {
    hashAdd(sizes, name, intToPt(size));
    return;
    }

/* Sizes are unsigned, so compare and report them as unsigned (%u). */
unsigned oldSize = ptToInt(hel->val);
if (oldSize != size)
    errAbort("sequence %s already specified as size %u, can't set to %u",
             name, oldSize, size);
}
Beispiel #9
0
struct hash *loadNameTable(struct sqlConnection *conn, 
    char *tableName, int hashSize)
/* Read the id and name columns of tableName and return a new hash keyed by
 * name, whose value is the id packed into the pointer slot.
 *   conn      - open database connection to query
 *   tableName - table to select id,name from
 *   hashSize  - size argument passed through to newHash */
{
struct hash *nameToId = newHash(hashSize);
char query[128];
sqlSafef(query, sizeof query, "select id,name from %s", tableName);

struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    hashAdd(nameToId, row[1], intToPt(sqlUnsigned(row[0])));
sqlFreeResult(&sr);
return nameToId;
}
Beispiel #10
0
struct hash *readSizes(char *fileName)
/* Read a two-column file (name, size) and return a new hash keyed by name
 * with the size packed into the pointer slot.  Duplicate names are not
 * checked for; each row is simply added. */
{
char *fields[2];
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *sizeHash = newHash(0);
for (;;)
    {
    if (!lineFileRow(lf, fields))
        break;
    int size = lineFileNeedNum(lf, fields, 1);
    /* trust the user to not have duplicated names in the lengths file */
    hashAdd(sizeHash, fields[0], intToPt(size));
    }
lineFileClose(&lf);
return sizeHash;
}
Beispiel #11
0
static char *findUniqueName(struct hash *dupeHash, char *root)
/* Return a name based on root that has not been handed out before.
 * The first time root is seen it is registered in dupeHash with a count of 1
 * and returned as is.  On later calls the count is incremented and
 * "root_<count>" is returned.
 * NOTE: the suffixed name lives in a static buffer that is overwritten by
 * the next call that produces a suffixed name. */
{
static char buf[256];
struct hashEl *hel = hashLookup(dupeHash, root);
if (hel == NULL)
    {
    hashAddInt(dupeHash, root, 1);
    return root;
    }

int useCount = ptToInt(hel->val) + 1;
hel->val = intToPt(useCount);
safef(buf, sizeof(buf), "%s_%d", root, useCount);
return buf;
}
struct hash *readSizes(char *fileName)
/* Read a two-column file (name, size) and return a new hash keyed by name
 * with the size packed into the pointer slot.  When a name occurs more than
 * once, a warning is issued and only the first size is kept. */
{
char *fields[2];
struct lineFile *lf = lineFileOpen(fileName, TRUE);
struct hash *sizeHash = newHash(0);
for (;;)
    {
    if (!lineFileRow(lf, fields))
        break;
    char *name = fields[0];
    int size = lineFileNeedNum(lf, fields, 1);
    if (hashLookup(sizeHash, name) == NULL)
        hashAdd(sizeHash, name, intToPt(size));
    else
        warn("Duplicate %s, ignoring all but first\n", name);
    }
lineFileClose(&lf);
return sizeHash;
}
void consolidateTheCounts(char *inputFile, char *outputFile)
/* Read the cat'ed file in, and either make a new hash item for each enzyme
 * encountered on each line, or add to an existing one. Then output the hash.
 *   inputFile  - file with two columns per row: name and integer count
 *   outputFile - file that the consolidated counts are written to via
 *                writeHashToFile
 * Counts for a name that appears on several rows are summed. */
{
struct lineFile *lf = lineFileOpen(inputFile, TRUE);
struct hash *countHash = newHash(12);
char *words[2];
while (lineFileRow(lf, words))
    {
    char *name = words[0];
    int count = lineFileNeedFullNum(lf, words, 1);
    struct hashEl *el = hashLookup(countHash, name);
    if (el == NULL)
        hashAddInt(countHash, name, count);
    else
        el->val = intToPt(ptToInt(el->val) + count);
    }
/* Done reading: release the input before writing the results. */
lineFileClose(&lf);
writeHashToFile(countHash, outputFile);
freeHash(&countHash);
}
Beispiel #14
0
int oneHubTrackSettings(char *hubUrl, struct hash *totals)
/* Read hub trackDb files, noting settings used.
 *   hubUrl - URL of the hub to open
 *   totals - if non-NULL, per-setting counts are accumulated into this hash
 *            and nothing is printed for the individual hub; if NULL, a
 *            private hash is used, printed with printCounts, and freed
 * Returns 0 on success, 1 if the hub could not be opened. */
{
struct trackHub *hub = NULL;

/* Opening the hub may abort; catch that so the caller just sees a failure. */
struct errCatch *errCatch = errCatchNew();
if (errCatchStart(errCatch))
    hub = trackHubOpen(hubUrl, "hub_0");
errCatchEnd(errCatch);
errCatchFree(&errCatch);

if (hub == NULL)
    return 1;

printf("%s (%s)\n", hubUrl, hub->shortLabel);
struct trackHubGenome *genome;
struct hash *counts;
if (totals)
    counts = totals;
else
    counts = newHash(0);
struct hashEl *el;
for (genome = hub->genomeList; genome != NULL; genome = genome->next)
    {
    struct trackDb *tdb, *tdbs = trackHubTracksForGenome(hub, genome);
    for (tdb = tdbs; tdb != NULL; tdb = tdb->next)
        {
        /* Walk every setting of this track and bump its usage count. */
        struct hashCookie cookie = hashFirst(trackDbHashSettings(tdb));
        verbose(2, "    track: %s\n", tdb->shortLabel);
        while ((el = hashNext(&cookie)) != NULL)
            {
            int count = hashIntValDefault(counts, el->name, 0);
            count++;
            hashReplace(counts, el->name, intToPt(count));
            }
        }
    }
if (!totals)
    {
    printCounts(counts);
    /* The hash was created here and holds only packed ints, so it is ours
     * to release; a caller-supplied totals hash is left alone. */
    hashFree(&counts);
    }
trackHubClose(&hub);
return 0;
}
struct hash *posHashFromTable(struct sqlConnection *conn, char *table)
/* Select name, chrom and chromStart from table and return a new hash keyed
 * by name.  Each value is a freshly allocated slPair whose name is a copy of
 * the chrom and whose val is chromStart packed into the pointer slot. */
{
char query[256];
sqlSafef(query, sizeof(query), "select name,chrom,chromStart from %s", table);

struct hash *positions = newHash(24);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
for (row = sqlNextRow(sr); row != NULL; row = sqlNextRow(sr))
    {
    struct slPair *chromAndStart;
    AllocVar(chromAndStart);
    chromAndStart->name = cloneString(row[1]);
    chromAndStart->val = intToPt((int)sqlUnsigned(row[2]));
    hashAdd(positions, row[0], chromAndStart);
    }
sqlFreeResult(&sr);
return positions;
}
void bioImageLoad(char *setRaFile, char *itemTabFile)
/* bioImageLoad - Load data into bioImage database.
 *   setRaFile   - .ra file with settings that apply to the whole submission
 *   itemTabFile - tab-separated file; its first line starts with '#' and
 *                 names the fields, each following line describes one image
 * For every image row the ids of its location/bodyPart/sliceType/imageType/
 * treatment values are looked up through cachedId, the remaining fields are
 * fetched with getVal, and a row is inserted into the image table.
 * An image that is already present (same fileName and fullLocation) is
 * deleted first when the global 'replace' is set, otherwise it is a fatal
 * error.
 * NOTE(review): field values are pasted into the SQL text unescaped, so the
 * input files have to be trusted. */
{
struct hash *raHash = raReadSingle(setRaFile);
struct hash *rowHash;
struct lineFile *lf = lineFileOpen(itemTabFile, TRUE);
char *line, *words[256];
struct sqlConnection *conn = sqlConnect(database);
int rowSize;
int submissionSetId;
/* One cache per looked-up table, passed to cachedId so each distinct value
 * can be remembered between rows. */
struct hash *fullDirHash = newHash(0);
struct hash *screenDirHash = newHash(0);
struct hash *thumbDirHash = newHash(0);
struct hash *treatmentHash = newHash(0);
struct hash *bodyPartHash = newHash(0);
struct hash *sliceTypeHash = newHash(0);
struct hash *imageTypeHash = newHash(0);
struct hash *sectionSetHash = newHash(0);
struct dyString *dy = dyStringNew(0);

/* Read first line of tab file, and from it get all the field names. */
if (!lineFileNext(lf, &line, NULL))
    errAbort("%s appears to be empty", lf->fileName);
if (line[0] != '#')
    errAbort("First line of %s needs to start with #, and then contain field names",
        lf->fileName);
rowHash = hashRowOffsets(line+1);
rowSize = rowHash->elCount;
if (rowSize >= ArraySize(words))
    errAbort("Too many fields in %s", lf->fileName);

/* Check that have all required fields */
    {
    char *fieldName;
    int i;

    /* Fields that must come from the .ra file. */
    for (i=0; i<ArraySize(requiredSetFields); ++i)
        {
        fieldName = requiredSetFields[i];
        if (!hashLookup(raHash, fieldName))
            errAbort("Field %s is not in %s", fieldName, setRaFile);
        }

    /* Fields that must come from the tab file. */
    for (i=0; i<ArraySize(requiredItemFields); ++i)
        {
        fieldName = requiredItemFields[i];
        if (!hashLookup(rowHash, fieldName))
            errAbort("Field %s is not in %s", fieldName, itemTabFile);
        }

    /* Fields that may come from either file. */
    for (i=0; i<ArraySize(requiredFields); ++i)
        {
        fieldName = requiredFields[i];
        if (!hashLookup(rowHash, fieldName) && !hashLookup(raHash, fieldName))
            errAbort("Field %s is not in %s or %s", fieldName, setRaFile, itemTabFile);
        }
    }

/* Create/find submission record. */
submissionSetId = saveSubmissionSet(conn, raHash);

/* Process rest of tab file. */
while (lineFileNextRowTab(lf, words, rowSize))
    {
    int fullDir = cachedId(conn, "location", "name", 
        fullDirHash, "fullDir", raHash, rowHash, words);
    int screenDir = cachedId(conn, "location", "name", 
        screenDirHash, "screenDir", raHash, rowHash, words);
    int thumbDir = cachedId(conn, "location", 
        "name", thumbDirHash, "thumbDir", raHash, rowHash, words);
    int bodyPart = cachedId(conn, "bodyPart", 
        "name", bodyPartHash, "bodyPart", raHash, rowHash, words);
    int sliceType = cachedId(conn, "sliceType", 
        "name", sliceTypeHash, "sliceType", raHash, rowHash, words);
    int imageType = cachedId(conn, "imageType", 
        "name", imageTypeHash, "imageType", raHash, rowHash, words);
    int treatment = cachedId(conn, "treatment", 
        "conditions", treatmentHash, "treatment", raHash, rowHash, words);
    /* The last getVal argument is the default; NULL is passed for the
     * fields that were verified as present above. */
    char *fileName = getVal("fileName", raHash, rowHash, words, NULL);
    char *submitId = getVal("submitId", raHash, rowHash, words, NULL);
    char *taxon = getVal("taxon", raHash, rowHash, words, NULL);
    char *isEmbryo = getVal("isEmbryo", raHash, rowHash, words, NULL);
    char *age = getVal("age", raHash, rowHash, words, NULL);
    char *sectionSet = getVal("sectionSet", raHash, rowHash, words, "");
    char *sectionIx = getVal("sectionIx", raHash, rowHash, words, "0");
    char *gene = getVal("gene", raHash, rowHash, words, "");
    char *locusLink = getVal("locusLink", raHash, rowHash, words, "");
    char *refSeq = getVal("refSeq", raHash, rowHash, words, "");
    char *genbank = getVal("genbank", raHash, rowHash, words, "");
    char *priority = getVal("priority", raHash, rowHash, words, "200");
    int sectionId = 0;
    int oldId;

    /* Map the section set label to a database id, creating a new
     * sectionSet row the first time a label is seen in this file. */
    if (sectionSet[0] != 0 && !sameString(sectionSet, "0"))
        {
        struct hashEl *hel = hashLookup(sectionSetHash, sectionSet);
        if (hel != NULL)
            sectionId = ptToInt(hel->val);
        else
            {
            sqlUpdate(conn, "insert into sectionSet values(default)");
            sectionId = sqlLastAutoId(conn);
            hashAdd(sectionSetHash, sectionSet, intToPt(sectionId));
            }
        }

    /* See whether this image is already in the database. */
    dyStringClear(dy);
    dyStringAppend(dy, "select id from image ");
    dyStringPrintf(dy, "where fileName = '%s' ", fileName);
    dyStringPrintf(dy, "and fullLocation = %d",  fullDir);
    oldId = sqlQuickNum(conn, dy->string);
    if (oldId != 0)
        {
        if (replace)
            {
            dyStringClear(dy);
            dyStringPrintf(dy, "delete from image where id = %d", oldId);
            sqlUpdate(conn, dy->string);
            }
        else
            errAbort("%s is already in database line %d of %s", 
                fileName, lf->lineIx, lf->fileName);
        }

    /* Insert the image row. */
    dyStringClear(dy);
    dyStringAppend(dy, "insert into image set\n");
    dyStringPrintf(dy, " id = default,\n");
    dyStringPrintf(dy, " fileName = '%s',\n", fileName);
    dyStringPrintf(dy, " fullLocation = %d,\n", fullDir);
    dyStringPrintf(dy, " screenLocation = %d,\n", screenDir);
    dyStringPrintf(dy, " thumbLocation = %d,\n", thumbDir);
    dyStringPrintf(dy, " submissionSet = %d,\n", submissionSetId);
    dyStringPrintf(dy, " sectionSet = %d,\n", sectionId);
    dyStringPrintf(dy, " sectionIx = %s,\n", sectionIx);
    dyStringPrintf(dy, " submitId = '%s',\n", submitId);
    dyStringPrintf(dy, " gene = '%s',\n", gene);
    dyStringPrintf(dy, " locusLink = '%s',\n", locusLink);
    dyStringPrintf(dy, " refSeq = '%s',\n", refSeq);
    dyStringPrintf(dy, " genbank = '%s',\n", genbank);
    dyStringPrintf(dy, " priority = %s,\n", priority);
    dyStringPrintf(dy, " taxon = %s,\n", taxon);
    dyStringPrintf(dy, " isEmbryo = %s,\n", isEmbryo);
    dyStringPrintf(dy, " age = %s,\n", age);
    dyStringPrintf(dy, " bodyPart = %d,\n", bodyPart);
    dyStringPrintf(dy, " sliceType = %d,\n", sliceType);
    dyStringPrintf(dy, " imageType = %d,\n", imageType);
    dyStringPrintf(dy, " treatment = %d\n", treatment);

    sqlUpdate(conn, dy->string);
    }

/* Release what was acquired here.  The id caches and rowHash hold only
 * packed ints as values, so freeing the hashes is sufficient.  raHash is
 * left alone since the ownership of its values is not visible here. */
lineFileClose(&lf);
dyStringFree(&dy);
hashFree(&rowHash);
hashFree(&fullDirHash);
hashFree(&screenDirHash);
hashFree(&thumbDirHash);
hashFree(&treatmentHash);
hashFree(&bodyPartHash);
hashFree(&sliceTypeHash);
hashFree(&imageTypeHash);
hashFree(&sectionSetHash);
sqlDisconnect(&conn);
}
Beispiel #17
0
void processRefSeq(char *database, char *faFile, char *raFile, char *pslFile, char *loc2refFile, 
        char *pepFile, char *mim2locFile)
/* hgRefSeqMrna - Load refSeq mRNA alignments and other info into 
 * refSeqGene table.
 *   database    - database to update
 *   faFile      - mRNA sequences, written to the refMrna tab file
 *   raFile      - .ra file with one record per mRNA (acc, siz, gen, cds, ngi, pro)
 *   pslFile     - alignments of the mRNAs
 *   loc2refFile - tab-separated file with at least 5 columns; column 0 is
 *                 the locusLink id, column 1 the mRNA accession, column 4
 *                 the protein accession
 *   pepFile     - protein sequences, written to the refPep tab file
 *   mim2locFile - two-column file: OMIM id, locusLink id
 * Tab files are created in the current directory and, unless the global
 * clTest is set, loaded into the database at the end. */
{
struct lineFile *lf;
struct hash *raHash, *rsiHash = newHash(0);
struct hash *loc2mimHash = newHash(0);
struct refSeqInfo *rsiList = NULL, *rsi;
char *s, *line, *row[5];
int wordCount, dotMod = 0;
int noLocCount = 0;
int rsiCount = 0;
int noProtCount = 0;
struct psl *psl;
struct sqlConnection *conn = hgStartUpdate(database);
struct hash *productHash = loadNameTable(conn, "productName", 16);
struct hash *geneHash = loadNameTable(conn, "geneName", 16);
char *kgName = "refGene";

FILE *kgTab = hgCreateTabFile(".", kgName);
FILE *productTab = hgCreateTabFile(".", "productName");
FILE *geneTab = hgCreateTabFile(".", "geneName");
FILE *refLinkTab = hgCreateTabFile(".", "refLink");
FILE *refPepTab = hgCreateTabFile(".", "refPep");
FILE *refMrnaTab = hgCreateTabFile(".", "refMrna");

struct exon *exonList = NULL, *exon;
char *answer;
char cond_str[200];

/* Make refLink and other tables table if they don't exist already,
 * and empty them. */
sqlMaybeMakeTable(conn, "refLink", refLinkTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refLink");
sqlMaybeMakeTable(conn, "refGene", refGeneTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refGene");
sqlMaybeMakeTable(conn, "refPep", refPepTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refPep");
sqlMaybeMakeTable(conn, "refMrna", refMrnaTableDef);
sqlUpdate(conn, "NOSQLINJ delete from refMrna");

/* Scan through locus link to omim ID file and put in hash. */
    {
    char *row[2];   /* deliberately narrower than the outer row[5] */

    printf("Scanning %s\n", mim2locFile);
    lf = lineFileOpen(mim2locFile, TRUE);
    while (lineFileRow(lf, row))
        {
        hashAdd(loc2mimHash, row[1], intToPt(atoi(row[0])));
        }
    lineFileClose(&lf);
    }

/* Scan through .ra file and make up start of refSeqInfo
 * objects in hash and list. */
printf("Scanning %s\n", raFile);
lf = lineFileOpen(raFile, TRUE);
while ((raHash = hashNextRa(lf)) != NULL)
    {
    if (clDots > 0 && ++dotMod == clDots )
        {
        dotMod = 0;
        dotOut();
        }
    AllocVar(rsi);
    slAddHead(&rsiList, rsi);
    if ((s = hashFindVal(raHash, "acc")) == NULL)
        errAbort("No acc near line %d of %s", lf->lineIx, lf->fileName);
    rsi->mrnaAcc = cloneString(s);
    if ((s = hashFindVal(raHash, "siz")) == NULL)
        errAbort("No siz near line %d of %s", lf->lineIx, lf->fileName);
    rsi->size = atoi(s);
    if ((s = hashFindVal(raHash, "gen")) != NULL)
        rsi->geneName = cloneString(s);
    /* Without a cds entry the cds end is set to the full mRNA size. */
    if ((s = hashFindVal(raHash, "cds")) != NULL)
        parseCds(s, 0, rsi->size, &rsi->cdsStart, &rsi->cdsEnd);
    else
        rsi->cdsEnd = rsi->size;
    if ((s = hashFindVal(raHash, "ngi")) != NULL)
        rsi->ngi = atoi(s);

    rsi->geneNameId = putInNameTable(geneHash, geneTab, rsi->geneName);
    s = hashFindVal(raHash, "pro");
    if (s != NULL)
        rsi->productName = cloneString(s);
    rsi->productNameId = putInNameTable(productHash, productTab, s);
    hashAdd(rsiHash, rsi->mrnaAcc, rsi);

    freeHashAndVals(&raHash);
    }
lineFileClose(&lf);
if (clDots) printf("\n");

/* Scan through loc2ref filling in some gaps in rsi. */
printf("Scanning %s\n", loc2refFile);
lf = lineFileOpen(loc2refFile, TRUE);
while (lineFileNext(lf, &line, NULL))
    {
    char *mrnaAcc;

    if (line[0] == '#')
        continue;
    wordCount = chopTabs(line, row);
    if (wordCount < 5)
        errAbort("Expecting at least 5 tab-separated words line %d of %s",
                lf->lineIx, lf->fileName);
    mrnaAcc = row[1];
    mrnaAcc = accWithoutSuffix(mrnaAcc);

    if (mrnaAcc[2] != '_')
        warn("%s is and odd name %d of %s", 
                mrnaAcc, lf->lineIx, lf->fileName);
    if ((rsi = hashFindVal(rsiHash, mrnaAcc)) != NULL)
        {
        rsi->locusLinkId = lineFileNeedNum(lf, row, 0);
        rsi->omimId = ptToInt(hashFindVal(loc2mimHash, row[0]));
        rsi->proteinAcc = cloneString(accWithoutSuffix(row[4]));
        }
    }
lineFileClose(&lf);

/* Report how many seem to be missing from loc2ref file. 
 * Write out knownInfo file. */
printf("Writing %s\n", "refLink.tab");
for (rsi = rsiList; rsi != NULL; rsi = rsi->next)
    {
    ++rsiCount;
    if (rsi->locusLinkId == 0)
        ++noLocCount;
    if (rsi->proteinAcc == NULL)
        ++noProtCount;
    fprintf(refLinkTab, "%s\t%s\t%s\t%s\t%u\t%u\t%u\t%u\n",
        emptyForNull(rsi->geneName), 
        emptyForNull(rsi->productName),
        emptyForNull(rsi->mrnaAcc), 
        emptyForNull(rsi->proteinAcc),
        rsi->geneNameId, rsi->productNameId, 
        rsi->locusLinkId, rsi->omimId);
    }
if (noLocCount) 
    printf("Missing locusLinkIds for %d of %d\n", noLocCount, rsiCount);
if (noProtCount)
    printf("Missing protein accessions for %d of %d\n", noProtCount, rsiCount);

/* Process alignments and write them out as genes. */
lf = pslFileOpen(pslFile);
dotMod = 0;
while ((psl = pslNext(lf)) != NULL)
    {
    if (hashFindVal(rsiHash, psl->qName) != NULL)
        {
        if (clDots > 0 && ++dotMod == clDots )
            {
            dotMod = 0;
            dotOut();
            }

        /* Only alignments whose accession has a spXref2 entry in the
         * protein database are written out. */
        sqlSafefFrag(cond_str, sizeof cond_str, "extAC='%s'", psl->qName);
        answer = sqlGetField(proteinDB, "spXref2", "displayID", cond_str);

        if (answer == NULL)
            {
            fprintf(stderr, "%s NOT FOUND.\n", psl->qName);
            fflush(stderr);
            }
        else
            {
            struct genePred *gp = NULL;
            exonList = pslToExonList(psl);
            fprintf(kgTab, "%s\t%s\t%c\t%d\t%d\t",
                psl->qName, psl->tName, psl->strand[0], psl->tStart, psl->tEnd);
            rsi = hashMustFindVal(rsiHash, psl->qName);

            gp = genePredFromPsl(psl, rsi->cdsStart, rsi->cdsEnd, genePredStdInsertMergeSize);
            if (!gp)
                errAbort("Cannot convert psl (%s) to genePred.\n", psl->qName);

            fprintf(kgTab, "%d\t%d\t", gp->cdsStart, gp->cdsEnd);
            fprintf(kgTab, "%d\t", slCount(exonList));

            fflush(kgTab);

            for (exon = exonList; exon != NULL; exon = exon->next)
                fprintf(kgTab, "%d,", exon->start);
            fprintf(kgTab, "\t");

            for (exon = exonList; exon != NULL; exon = exon->next)
                fprintf(kgTab, "%d,", exon->end);
            fprintf(kgTab, "\n");
            slFreeList(&exonList);
            /* The genePred was only needed for its CDS coordinates. */
            genePredFree(&gp);
            }
        }
    else
        {
        fprintf(stderr, "%s found in psl, but not in .fa or .ra data files.\n", psl->qName);
        fflush(stderr);
        }
    /* Each alignment is fully handled within one iteration. */
    pslFree(&psl);
    }
lineFileClose(&lf);

if (clDots) printf("\n");

if (!clTest)
    {
    writeSeqTable(pepFile, refPepTab, FALSE, TRUE);
    writeSeqTable(faFile, refMrnaTab, FALSE, FALSE);
    }

carefulClose(&kgTab);
carefulClose(&productTab);
carefulClose(&geneTab);
carefulClose(&refLinkTab);
carefulClose(&refPepTab);
carefulClose(&refMrnaTab);

if (!clTest)
    {
    printf("Loading database with %s\n", kgName);
    fflush(stdout);
    
    hgLoadTabFile(conn, ".", kgName, NULL);

    printf("Loading database with %s\n", "productName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "productName", NULL);
    
    printf("Loading database with %s\n", "geneName");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "geneName", NULL);
    
    printf("Loading database with %s\n", "refLink");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refLink", NULL);
    
    printf("Loading database with %s\n", "refPep");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refPep", NULL);
    
    printf("Loading database with %s\n", "refMrna");
    fflush(stdout);
    hgLoadTabFile(conn, ".", "refMrna", NULL);
    }
}
Beispiel #18
0
void rangeTreeAddToCoverageDepth(struct rbTree *tree, int start, int end)
/* Add area from start to end to a tree that is being built up to store the
 * depth of coverage.  Recover coverage back out by looking at ptToInt(range->val)
 * on tree elements.
 *   tree  - tree of ranges; each range's val is its depth packed into the
 *           pointer slot.  New ranges are allocated from tree->lm.
 *   start - start of the area to add
 *   end   - end of the area to add
 * Existing ranges that only partly overlap the new area are split at the
 * area's boundaries so that only the overlapping part gets its depth
 * incremented. */
{
struct range q;
q.start = start;
q.end = end;

struct range *r, *existing = rbTreeFind(tree, &q);
if (existing == NULL)
    {
    /* Nothing found for this area: it is all new, at depth 1. */
    lmAllocVar(tree->lm, r);
    r->start = start;
    r->end = end;
    r->val = intToPt(1);
    rbTreeAdd(tree, r);
    }
else if (existing->start <= start && existing->end >= end)
    {
    /* The existing one completely encompasses us */

    /* Make a new section for the bit before start.  The existing range is
     * shrunk before the new piece is added so the two never overlap. */
    if (existing->start < start)
        {
        lmAllocVar(tree->lm, r);
        r->start = existing->start;
        r->end = start;
        r->val = existing->val;
        existing->start = start;
        rbTreeAdd(tree, r);
        }
    /* Make a new section for the bit after end. */
    if (existing->end > end)
        {
        lmAllocVar(tree->lm, r);
        r->start = end;
        r->end = existing->end;
        r->val = existing->val;
        existing->end = end;
        rbTreeAdd(tree, r);
        }
    /* Increment existing section in overlapping area.  The depth lives in
     * the pointer value itself, so adding 1 to the pointer adds 1 to it. */
    existing->val = (char *)(existing->val) + 1;
    }
else
    {
    /* In general case fetch list of regions that overlap us. 
       Remaining cases to handle are: 
             r >> e     rrrrrrrrrrrrrrrrrrrr
                             eeeeeeeeee

             e < r           rrrrrrrrrrrrrrr
                        eeeeeeeeeeee

             r < e      rrrrrrrrrrrr
                             eeeeeeeeeeeee
     */
    struct range *existingList = rangeTreeAllOverlapping(tree, start, end);

#ifdef DEBUG
    /* Make sure that list is really sorted for debugging... */
    int lastStart = existingList->start;
    for (r = existingList; r != NULL; r = r->next)
        {
        if (r->start < lastStart)
            internalErr();
        lastStart = r->start;
        }
#endif /* DEBUG */

    /* s is the start of the part of the new area not yet accounted for. */
    int s = start, e = end;
    for (existing = existingList; existing != NULL; existing = existing->next)
        {
        if (s < existing->start)
            {
            /* Deal with start of new range that comes before existing */
            lmAllocVar(tree->lm, r);
            r->start = s;
            r->end = existing->start;
            r->val = intToPt(1);
            s = existing->start;
            rbTreeAdd(tree, r);
            }
        else if (s > existing->start)
            {
            /* Existing starts before the new area: split off its head,
             * which keeps the old depth. */
            lmAllocVar(tree->lm, r);
            r->start = existing->start;
            r->end = s;
            r->val = existing->val;
            existing->start = s;
            rbTreeAdd(tree, r);
            }
        if (existing->end > e)
            {
            /* Existing runs past the end of the new area: split off its
             * tail, which keeps the old depth, so that only the part
             * inside the new area is incremented below. */
            lmAllocVar(tree->lm, r);
            r->start = e;
            r->end = existing->end;
            r->val = existing->val;
            existing->end = e;
            rbTreeAdd(tree, r);
            }
        existing->val = (char *)(existing->val) + 1;
        s = existing->end;
        }
    if (s < e)
        {
        /* Deal with end of new range that doesn't overlap with anything. */
        lmAllocVar(tree->lm, r);
        r->start = s;
        r->end = e;
        r->val = intToPt(1);
        rbTreeAdd(tree, r);
        }
    }
}
Beispiel #19
0
static int bedToGffLines(struct bed *bedList, struct slName *exonFramesList, struct hTableInfo *hti,
                         int fieldCount, char *source, boolean gtf2StopCodons)
/* Translate a (list of) bed into gff and print out.
 * Note that field count (perhaps reduced by bitwise intersection)
 * can in effect override hti.
 *   bedList        - beds to translate; NULL yields 0 with no output
 *   exonFramesList - optional list that is walked in step with bedList; the
 *                    current element's name is handed to computeFrames.
 *                    May be NULL or shorter than bedList, in which case
 *                    NULL is handed to computeFrames for the beds without
 *                    a matching element.
 *   hti            - table info; hasBlocks/hasCDS select the output form
 *   fieldCount     - number of bed fields available
 *   source         - passed through to the gff line writers
 *   gtf2StopCodons - passed through to addCdsStartStop
 * Returns the number of beds processed. */
{
if (! bedList)
    return 0;
struct hash *nameHash = newHash(20);
struct bed *bed;
struct slName *exonFrames = exonFramesList;
int i, exonStart, exonEnd;
char txName[256];
int itemCount = 0;
static int namelessIx = 0;

for (bed = bedList;  bed != NULL;  bed = bed->next)
    {
    /* Enforce unique transcript_ids. */
    if (bed->name != NULL)
        {
        struct hashEl *hel = hashLookup(nameHash, bed->name);
        int dupCount = (hel != NULL ? ptToInt(hel->val) : 0);
        if (dupCount > 0)
            {
            /* Name seen before: append a _dupN suffix. */
            safef(txName, sizeof(txName), "%s_dup%d", bed->name, dupCount);
            hel->val = intToPt(dupCount + 1);
            }
        else
            {
            safef(txName, sizeof(txName), "%s", bed->name);
            hashAddInt(nameHash, bed->name, 1);
            }
        }
    else
        safef(txName, sizeof(txName), "tx%d", ++namelessIx);
    if (hti->hasBlocks && hti->hasCDS && fieldCount > 4)
        {
        /* first pass: compute frames, in order dictated by strand. */
        int startIndx = 0, stopIndx = 0;
        char *frames = NULL;
        char *ef = NULL;
        /* Test the cursor, not the list head: the cursor is NULL once the
         * frames list has run out before the bed list has. */
        if (exonFrames != NULL)
            ef = exonFrames->name;
        frames = computeFrames(bed, ef, &startIndx, &stopIndx);

        /* second pass: one exon (possibly CDS, start/stop_codon) per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            if ((exonStart < bed->thickEnd) && (exonEnd > bed->thickStart))
                {
                int exonCdsStart = max(exonStart, bed->thickStart);
                int exonCdsEnd = min(exonEnd, bed->thickEnd);
                addCdsStartStop(bed, source, exonCdsStart, exonCdsEnd,
                                frames, i, startIndx, stopIndx, gtf2StopCodons, txName);
                }
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        freeMem(frames);
        }
    else if (hti->hasBlocks && fieldCount > 4)
        {
        /* Blocks but no CDS: one exon line per block. */
        for (i=0;  i < bed->blockCount;  i++)
            {
            exonStart = bed->chromStart + bed->chromStarts[i];
            exonEnd = exonStart + bed->blockSizes[i];
            addGffLineFromBed(bed, source, "exon", exonStart, exonEnd, '.', txName);
            }
        }
    else if (hti->hasCDS && fieldCount > 4)
        {
        /* CDS but no blocks: exon before the thick region, the thick
         * region as CDS, exon after it. */
        if (bed->thickStart == 0 && bed->thickEnd == 0)
            bed->thickStart = bed->thickEnd = bed->chromStart;
        if (bed->thickStart > bed->chromStart)
            {
            addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->thickStart, '.', txName);
            }
        if (bed->thickEnd > bed->thickStart)
            addGffLineFromBed(bed, source, "CDS", bed->thickStart, bed->thickEnd, '0', txName);
        if (bed->thickEnd < bed->chromEnd)
            {
            addGffLineFromBed(bed, source, "exon", bed->thickEnd, bed->chromEnd, '.', txName);
            }
        }
    else
        {
        /* Neither blocks nor CDS: the whole bed is a single exon. */
        addGffLineFromBed(bed, source, "exon", bed->chromStart, bed->chromEnd, '.', txName);
        }
    itemCount++;
    if (exonFrames)
        exonFrames = exonFrames->next;
    }
hashFree(&nameHash);
return itemCount;
}
Beispiel #20
0
static void clusterClone(int argc, char *argv[])
{
int i;

for (i=1; i < argc; ++i)
    {
    struct lineFile *lf;
    struct psl *psl;
    unsigned tSize;
    char *prevAccPart = (char *)NULL;
    char *prevAccName = (char *)NULL;
    char *prevTargetName = (char *)NULL;
    struct hashEl *el;
    struct hash *chrHash = newHash(0);
    struct hash *coordHash = newHash(0);
    struct coordEl *coord;
    struct coordEl **coordListPt = (struct coordEl **) NULL;
    unsigned querySize = 0;
    int partCount = 0;
    int partsConsidered = 0;

    verbose(2,"#\tprocess: %s\n", argv[i]);
    lf=pslFileOpen(argv[i]);
    while ((struct psl *)NULL != (psl = pslNext(lf)) )
	{
	char *accName = (char *)NULL;
	char *targetName = (char *)NULL;
	int chrCount = 0;
	double percentCoverage;

	accName = cloneString(psl->qName);
	if ((char *)NULL == prevAccPart)
	    {
	    prevAccPart = cloneString(psl->qName);  /* first time */
	    querySize = psl->qSize;
	    ++partsConsidered;
	    }
	chopSuffixAt(accName,'_');

	if ((char *)NULL == prevAccName)
		prevAccName = cloneString(accName);  /* first time */
	if ((char *)NULL == prevTargetName)
		prevTargetName = cloneString(psl->tName);  /* first time */

	/*	encountered a new accession name, process the one we
 	 *	were working on
	 */
	if (differentWord(accName, prevAccName))
	    {
	    if (partCount > 0)
		processResult(chrHash, coordHash, prevAccName, querySize,
		    partsConsidered);
	    else
		verbose(1,"# ERROR %s %s - no coordinates found in %d parts considered\n",
		    prevTargetName, prevAccName, partsConsidered);
	    freeMem(prevAccName);
	    prevAccName = cloneString(accName);
	    freeHash(&chrHash);
	    freeHash(&coordHash);
	    chrHash = newHash(0);
	    coordHash = newHash(0);
	    querySize = 0;
	    partCount = 0;
	    partsConsidered = 0;
	    }

	tSize = psl->tEnd - psl->tStart;
	percentCoverage = 100.0*((double)(tSize+1)/(psl->qSize + 1));
	if (differentWord(psl->qName, prevAccPart))
	    {
	    ++partsConsidered;
	    querySize += psl->qSize;
	    freeMem(prevAccPart);
	    prevAccPart = cloneString(psl->qName);
	    }

	targetName = cloneString(psl->tName);
	if (differentWord(targetName, prevTargetName))
	    {
	    freeMem(prevTargetName);
	    prevTargetName = cloneString(targetName);
	    }
	/*	keep a hash of chrom names encountered	*/
	el = hashLookup(chrHash, targetName);
	if (el == NULL)
	    {
	    if (percentCoverage > minCover)
		{
		hashAddInt(chrHash, targetName, 1);
		chrCount = 1;
		}
	    else
		{
		hashAddInt(chrHash, targetName, 0);
		chrCount = 0;
		}
	    }
	else
	    {
	    if (percentCoverage > minCover)
		{
		chrCount = ptToInt(el->val) + 1;
		el->val=intToPt(chrCount);
		}
	    }

	AllocVar(coord);
	coord->start = psl->tStart;
	coord->end = psl->tEnd;
	coord->qSize = psl->qSize;
	coord->strand = sameWord(psl->strand,"+") ? 1 : 0;
	/*	when coverage is sufficient	*/
	if (percentCoverage > minCover)
	    {
	    ++partCount;
	    coord->name = cloneString(psl->qName);
	    /*	for each chrom name, accumulate a list of coordinates */
	    el = hashLookup(coordHash, targetName);
	    if (el == NULL)
		{
		AllocVar(coordListPt);
		hashAdd(coordHash, targetName, coordListPt);
		}
	    else
		{
		coordListPt = el->val;
		}
	    slAddHead(coordListPt,coord);
	verbose(2,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }
	else
	    {
	verbose(3,"# %s\t%u\t%u\t%u\t%.4f\t%d %s:%d-%d %s\n",
	    psl->qName, psl->qSize, tSize, tSize - psl->qSize,
	    percentCoverage, chrCount, psl->tName, psl->tStart, psl->tEnd,
	    psl->strand);
	    }


	freeMem(accName);
	freeMem(targetName);
	pslFree(&psl);
	}
    if (partCount > 0)
	processResult(chrHash, coordHash, prevAccName, querySize,
	    partsConsidered);
    else
	verbose(1,"# ERROR %s %s - no coordinates found\n",
	    prevTargetName, prevAccName);
    freeMem(prevAccName);
    freeHash(&chrHash);
    freeHash(&coordHash);
    lineFileClose(&lf);
    }
}	/*	static void clusterClone()	*/